From b780951c2a03d874e05f50f59359599b61ac2158 Mon Sep 17 00:00:00 2001 From: AntonieV Date: Wed, 9 Jun 2021 12:57:24 +0200 Subject: [PATCH 01/40] descriptions for report for deseq2 pca, volcano, ma and scatter plot and for fingerprint and profile plot and for heatmap --- workflow/report/plot_deseq2_FDR_1_perc_MA.rst | 6 +++++- workflow/report/plot_deseq2_FDR_1_perc_volcano.rst | 4 ++++ workflow/report/plot_deseq2_FDR_5_perc_MA.rst | 1 + workflow/report/plot_deseq2_FDR_5_perc_volcano.rst | 5 ++++- workflow/report/plot_deseq2_pca.rst | 8 +++++++- workflow/report/plot_deseq2_scatter.rst | 7 ++++++- workflow/report/plot_fingerprint_deeptools.rst | 10 +++++++++- workflow/report/plot_heatmap_deeptools.rst | 5 ++++- workflow/report/plot_profile_deeptools.rst | 5 ++++- 9 files changed, 44 insertions(+), 7 deletions(-) diff --git a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst index 8d1c8b6..9314621 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst @@ -1 +1,5 @@ - +**`MA plot `_ (FDR 0.01)** +shows the log2 fold changes versus the mean of normalized counts from +`DESeq2 `_ analysis filtered on a false +discovery rate (FDR) threshold of 0.01 for all samples of each antibody. For more information about DESeq2 please see +`documentation `_. diff --git a/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst b/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst index 8d1c8b6..526f5d0 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst @@ -1 +1,5 @@ +**Volcano plot (FDR 0.01)** shows the significance (adjusted p-value) versus the log2 fold changes of the results of the +`DESeq2 `_ analysis filtered on a false +discovery rate (FDR) threshold of 0.01 for all samples of each antibody. For more information about DESeq2 please see +`documentation `_. diff --git a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst index 8d1c8b6..09a79f5 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst @@ -1 +1,2 @@ +**`MA plot `_ (FDR 0.05)** shows the log2 fold changes versus the mean of normalized counts from `DESeq2 `_ analysis filtered on a false discovery rate (FDR) threshold of 0.05 for all samples of each antibody. For more information about DESeq2 please see `documentation https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html`_. diff --git a/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst b/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst index 8d1c8b6..002e15a 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst @@ -1 +1,4 @@ - +**Volcano plot (FDR 0.05)** shows the significance (adjusted p-value) versus the log2 fold changes of the results of the +`DESeq2 `_ analysis filtered on a false +discovery rate (FDR) threshold of 0.05 for all samples of each antibody. For more information about DESeq2 please see +`documentation `_. diff --git a/workflow/report/plot_deseq2_pca.rst b/workflow/report/plot_deseq2_pca.rst index 8d1c8b6..581af78 100644 --- a/workflow/report/plot_deseq2_pca.rst +++ b/workflow/report/plot_deseq2_pca.rst @@ -1 +1,7 @@ - +**`PCA plot `_ +(Principal Component Analysis)** describes variance of the +`rlog `_ or +`vst `_ transformed counts from +`DESeq2 `_ analysis with regard to +the antibody used. For more information about DESeq2 please see +`documentation `_. diff --git a/workflow/report/plot_deseq2_scatter.rst b/workflow/report/plot_deseq2_scatter.rst index 8d1c8b6..2b0fa18 100644 --- a/workflow/report/plot_deseq2_scatter.rst +++ b/workflow/report/plot_deseq2_scatter.rst @@ -1 +1,6 @@ - +**Scatter plots** uses the matrix of the +`rlog `_ or +`vst `_ transformed counts from +`DESeq2 `_ analysis for each sample +to compare them in paired combinations. For more information about DESeq2 please see +`documentation `_. diff --git a/workflow/report/plot_fingerprint_deeptools.rst b/workflow/report/plot_fingerprint_deeptools.rst index 8d1c8b6..9874c98 100644 --- a/workflow/report/plot_fingerprint_deeptools.rst +++ b/workflow/report/plot_fingerprint_deeptools.rst @@ -1 +1,9 @@ - +**deepTools `plotFingerprint `_** is a +quality control tool, which determines how well the signal in the ChIP-seq sample can be differentiated from the +background distribution of reads in the control sample. plotFingerprint randomly samples genome regions of a specified +length and sums the per-base coverage that overlap with those regions. These values are then sorted according to their +rank and the cumulative sum of read counts is plotted. With a perfect uniform distribution of reads along the genome and +infinite sequencing coverage a straight diagonal line is shown in the plot. For more information on the interpretation of +the plots please see `here `_. +For more information about plotFingerprint metrics see +`here `_. diff --git a/workflow/report/plot_heatmap_deeptools.rst b/workflow/report/plot_heatmap_deeptools.rst index 8d1c8b6..aa76279 100644 --- a/workflow/report/plot_heatmap_deeptools.rst +++ b/workflow/report/plot_heatmap_deeptools.rst @@ -1 +1,4 @@ - +**deepTools `plotHeatmap `_** creates a +heatmap for scores over sets of genomic regions and gives a visualisation for the genome-wide enrichment of the samples. +The `scores `_ represent the signal over a +set of regions where all regions are scaled to the same size. diff --git a/workflow/report/plot_profile_deeptools.rst b/workflow/report/plot_profile_deeptools.rst index 8d1c8b6..5cd87b3 100644 --- a/workflow/report/plot_profile_deeptools.rst +++ b/workflow/report/plot_profile_deeptools.rst @@ -1 +1,4 @@ - +**deepTools `plotProfile `_** creates a +profile plot for scores over sets of genomic regions and gives a visualisation for the genome-wide enrichment of the +samples. The `scores `_ represent the +signal over a set of regions where all regions are scaled to the same size. From 80d1282a803cf4c561659e7e8acac2cab514dad0 Mon Sep 17 00:00:00 2001 From: AntonieV Date: Wed, 16 Jun 2021 00:45:13 +0200 Subject: [PATCH 02/40] changes on deseq2 for comparisons of samples across the groups from each antibody, adaptations for snakemake workflow catalog, more plot descriptions for report, cleanup configs, README and ToDo's --- .test/config/config.yaml | 19 +++- .test/config/units.tsv | 34 +++--- .test/config_paired_end/config.yaml | 21 +++- .test/config_paired_end_reduced/config.yaml | 19 +++- .test/config_single_end/config.yaml | 19 +++- .test/config_single_end_reduced/config.yaml | 19 +++- README.md | 102 +----------------- config/README.md | 60 +++++++++++ config/config.yaml | 23 ++-- config/samples.tsv | 4 - config/units.tsv | 5 - .../plot_annotatepeaks_summary_homer.rst | 4 +- ...t_base_distribution_by_cycle_picard_mm.rst | 9 +- .../report/plot_consensus_peak_intersect.rst | 3 +- workflow/report/plot_deseq2_FDR_1_perc_MA.rst | 3 +- .../report/plot_deseq2_FDR_1_perc_volcano.rst | 3 +- workflow/report/plot_deseq2_FDR_5_perc_MA.rst | 8 +- .../report/plot_deseq2_FDR_5_perc_volcano.rst | 3 +- workflow/report/plot_deseq2_heatmap.rst | 8 +- .../plot_deseq2_sample_corr_heatmap.rst | 7 +- workflow/report/plot_deseq2_scatter.rst | 4 +- .../report/plot_frip_score_macs2_bedtools.rst | 3 +- .../plot_insert_size_histogram_picard_mm.rst | 7 +- workflow/report/plot_macs2_qc.rst | 7 +- workflow/report/plot_peaks_count_macs2.rst | 3 +- .../plot_quality_by_cycle_picard_mm.rst | 8 +- .../plot_quality_distribution_picard_mm.rst | 8 +- workflow/report/workflow.rst | 3 +- workflow/rules/common.smk | 21 ++-- workflow/rules/consensus_peak_analysis.smk | 74 ++++++++----- workflow/rules/filtering.smk | 10 +- workflow/rules/peak_analysis.smk | 50 ++++++--- workflow/rules/post-analysis.smk | 43 +++++--- workflow/scripts/col_mod_featurecounts.py | 3 +- workflow/scripts/featurecounts_deseq2.R | 36 +++++-- 35 files changed, 413 insertions(+), 240 deletions(-) create mode 100644 config/README.md diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 1031ab7..14c186c 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -22,7 +22,7 @@ resources: build: GRCh38 # for testing data a specific chromosome can be selected chromosome: - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -45,7 +45,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: True peak-annotation-analysis: activate: True @@ -53,6 +53,21 @@ params: activate: True consensus-peak-analysis: activate: True + # samtools view parameters: + # if duplicates should be removed in this filtering, add "-F 0x0400" to the params + # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params + # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file + samtools-view-se: "-b -F 0x004" + samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/.test/config/units.tsv b/.test/config/units.tsv index 857731c..d106e5e 100644 --- a/.test/config/units.tsv +++ b/.test/config/units.tsv @@ -6,36 +6,36 @@ D 1 SRR1635446 ILLUMINA E 1 SRR1635447 ILLUMINA F 1 SRR1635448 ILLUMINA G 1 SRR1635449 ILLUMINA -H 2 SRR1635450 ILLUMINA +H 1 SRR1635450 ILLUMINA I 1 SRR1635451 ILLUMINA -J 2 SRR1635452 ILLUMINA +J 1 SRR1635452 ILLUMINA K 1 SRR1635453 ILLUMINA -L 2 SRR1635454 ILLUMINA +L 1 SRR1635454 ILLUMINA M 1 SRR1635455 ILLUMINA -N 2 SRR1635456 ILLUMINA +N 1 SRR1635456 ILLUMINA O 1 SRR1635457 ILLUMINA -P 2 SRR1635458 ILLUMINA +P 1 SRR1635458 ILLUMINA Q 1 SRR1635459 ILLUMINA -R 2 SRR1635460 ILLUMINA +R 1 SRR1635460 ILLUMINA S 1 SRR1635461 ILLUMINA -T 2 SRR1635462 ILLUMINA +T 1 SRR1635462 ILLUMINA U 1 SRR1635463 ILLUMINA -V 2 SRR1635464 ILLUMINA +V 1 SRR1635464 ILLUMINA W 1 SRR1635465 ILLUMINA -X 2 SRR1635466 ILLUMINA +X 1 SRR1635466 ILLUMINA Y 1 SRR1635467 ILLUMINA -Z 2 SRR1635468 ILLUMINA +Z 1 SRR1635468 ILLUMINA AA 1 SRR1635469 ILLUMINA -AB 2 SRR1635470 ILLUMINA +AB 1 SRR1635470 ILLUMINA AC 1 SRR1635471 ILLUMINA -AD 2 SRR1635472 ILLUMINA +AD 1 SRR1635472 ILLUMINA AE 1 SRR1635473 ILLUMINA -AF 2 SRR1635474 ILLUMINA +AF 1 SRR1635474 ILLUMINA AG 1 SRR1635435 ILLUMINA -AH 2 SRR1635436 ILLUMINA +AH 1 SRR1635436 ILLUMINA AI 1 SRR1635437 ILLUMINA -AJ 2 SRR1635438 ILLUMINA +AJ 1 SRR1635438 ILLUMINA AK 1 SRR1635439 ILLUMINA -AL 2 SRR1635440 ILLUMINA +AL 1 SRR1635440 ILLUMINA AM 1 SRR1635441 ILLUMINA -AN 2 SRR1635442 ILLUMINA +AN 1 SRR1635442 ILLUMINA diff --git a/.test/config_paired_end/config.yaml b/.test/config_paired_end/config.yaml index 280c1cb..1ef8de3 100644 --- a/.test/config_paired_end/config.yaml +++ b/.test/config_paired_end/config.yaml @@ -17,9 +17,9 @@ resources: release: 101 # Genome build build: R64-1-1 - # for testing data only chromosome 21 is selected + # for testing data a single chromosome can be selected chromosome: - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -42,7 +42,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: True peak-annotation-analysis: activate: True @@ -50,6 +50,21 @@ params: activate: True consensus-peak-analysis: activate: True + # samtools view parameters: + # if duplicates should be removed in this filtering, add "-F 0x0400" to the params + # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params + # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file + samtools-view-se: "-b -F 0x004" + samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/.test/config_paired_end_reduced/config.yaml b/.test/config_paired_end_reduced/config.yaml index 6663e58..bfcc458 100644 --- a/.test/config_paired_end_reduced/config.yaml +++ b/.test/config_paired_end_reduced/config.yaml @@ -19,7 +19,7 @@ resources: build: R64-1-1 # for testing data a specific chromosome can be selected chromosome: VII - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -42,7 +42,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: False peak-annotation-analysis: activate: True @@ -50,6 +50,21 @@ params: activate: True consensus-peak-analysis: activate: True + # samtools view parameters: + # if duplicates should be removed in this filtering, add "-F 0x0400" to the params + # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params + # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file + samtools-view-se: "-b -F 0x004" + samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/.test/config_single_end/config.yaml b/.test/config_single_end/config.yaml index 947b992..b239756 100644 --- a/.test/config_single_end/config.yaml +++ b/.test/config_single_end/config.yaml @@ -22,7 +22,7 @@ resources: build: GRCh38 # for testing data a specific chromosome can be selected chromosome: - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -45,7 +45,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: True peak-annotation-analysis: activate: True @@ -53,6 +53,21 @@ params: activate: True consensus-peak-analysis: activate: True + # samtools view parameters: + # if duplicates should be removed in this filtering, add "-F 0x0400" to the params + # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params + # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file + samtools-view-se: "-b -F 0x004" + samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/.test/config_single_end_reduced/config.yaml b/.test/config_single_end_reduced/config.yaml index 15630e3..1870c13 100644 --- a/.test/config_single_end_reduced/config.yaml +++ b/.test/config_single_end_reduced/config.yaml @@ -22,7 +22,7 @@ resources: build: GRCh38 # for testing data a specific chromosome can be selected chromosome: 21 - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -45,7 +45,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: True peak-annotation-analysis: activate: True @@ -53,6 +53,21 @@ params: activate: True consensus-peak-analysis: activate: True + # samtools view parameters: + # if duplicates should be removed in this filtering, add "-F 0x0400" to the params + # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params + # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file + samtools-view-se: "-b -F 0x004" + samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/README.md b/README.md index 40495a0..39b49bf 100644 --- a/README.md +++ b/README.md @@ -1,104 +1,12 @@ # Snakemake workflow: chipseq -[![Snakemake](https://img.shields.io/badge/snakemake-≥5.14.0-brightgreen.svg)](https://snakemake.bitbucket.io) -[![Build Status](https://travis-ci.org/snakemake-workflows/chipseq.svg?branch=master)](https://travis-ci.org/snakemake-workflows/chipseq) +[![Snakemake](https://img.shields.io/badge/snakemake-≥6.4.0-brightgreen.svg)](https://snakemake.github.io) +[![GitHub actions status](https://github.com/snakemake-workflows/chipseq/workflows/Tests/badge.svg?branch=master)](https://github.com/snakemake-workflows/chipseq/actions?query=branch%3Amaster+workflow%3ATests) -This is the template for a new Snakemake workflow. Replace this text with a comprehensive description covering the purpose and domain. -Insert your code into the respective folders, i.e. `scripts`, `rules`, and `envs`. Define the entry point of the workflow in the `Snakefile` and the main configuration in the `config.yaml` file. - -## Authors - -* Antonie Vietor (@AntonieV) +This workflow is a Snakemake port of the [nextflow chipseq pipeline](https://nf-co.re/chipseq) and performs ChIP-seq peak-calling, QC and differential analysis. ## Usage -If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and, if available, its DOI (see above). - -### Step 1: Obtain a copy of this workflow - -1. Create a new github repository using this workflow [as a template](https://help.github.com/en/articles/creating-a-repository-from-a-template). -2. [Clone](https://help.github.com/en/articles/cloning-a-repository) the newly created repository to your local system, into the place where you want to perform the data analysis. - -### Step 2: Configure workflow - -Configure the workflow according to your needs via editing the files in the `config/` folder. Adjust `config.yaml` to configure the workflow execution, and `samples.tsv` to specify your sample setup. - -### Step 3: Install Snakemake - -Install Snakemake using [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html): - - conda create -c bioconda -c conda-forge -n snakemake snakemake - -For installation details, see the [instructions in the Snakemake documentation](https://snakemake.readthedocs.io/en/stable/getting_started/installation.html). - -### Step 4: Execute workflow - -Activate the conda environment: - - conda activate snakemake - -Test your configuration by performing a dry-run via - - snakemake --use-conda -n - -Execute the workflow locally via - - snakemake --use-conda --cores $N - -using `$N` cores or run it in a cluster environment via - - snakemake --use-conda --cluster qsub --jobs 100 - -or - - snakemake --use-conda --drmaa --jobs 100 - -If you not only want to fix the software stack but also the underlying OS, use - - snakemake --use-conda --use-singularity - -in combination with any of the modes above. -See the [Snakemake documentation](https://snakemake.readthedocs.io/en/stable/executable.html) for further details. - -### Step 5: Investigate results - -After successful execution, you can create a self-contained interactive HTML report with all results via: - - snakemake --report report.html - -This report can, e.g., be forwarded to your collaborators. -An example (using some trivial test data) can be seen [here](https://cdn.rawgit.com/snakemake-workflows/rna-seq-kallisto-sleuth/master/.test/report.html). - -### Step 6: Commit changes - -Whenever you change something, don't forget to commit the changes back to your github copy of the repository: - - git commit -a - git push - -### Step 7: Obtain updates from upstream - -Whenever you want to synchronize your workflow copy with new developments from upstream, do the following. - -1. Once, register the upstream repository in your local copy: `git remote add -f upstream git@github.com:snakemake-workflows/chipseq.git` or `git remote add -f upstream https://github.com/snakemake-workflows/chipseq.git` if you do not have setup ssh keys. -2. Update the upstream version: `git fetch upstream`. -3. Create a diff with the current version: `git diff HEAD upstream/master workflow > upstream-changes.diff`. -4. Investigate the changes: `vim upstream-changes.diff`. -5. Apply the modified diff via: `git apply upstream-changes.diff`. -6. Carefully check whether you need to update the config files: `git diff HEAD upstream/master config`. If so, do it manually, and only where necessary, since you would otherwise likely overwrite your settings and samples. - - -### Step 8: Contribute back - -In case you have also changed or added steps, please consider contributing them back to the original repository: - -1. [Fork](https://help.github.com/en/articles/fork-a-repo) the original repo to a personal or lab account. -2. [Clone](https://help.github.com/en/articles/cloning-a-repository) the fork to your local system, to a different place than where you ran your analysis. -3. Copy the modified files from your analysis to the clone of your fork, e.g., `cp -r workflow path/to/fork`. Make sure to **not** accidentally copy config file contents or sample sheets. Instead, manually update the example config files if necessary. -4. Commit and push your changes to your fork. -5. Create a [pull request](https://help.github.com/en/articles/creating-a-pull-request) against the original repository. - -## Testing - -Test cases are in the subfolder `.test`. They are automatically executed via continuous integration with [Github Actions](https://github.com/features/actions). +The usage of this workflow is described in the [Snakemake Workflow Catalog](https://snakemake.github.io/snakemake-workflow-catalog/?usage=snakemake-workflows/chipseq). +If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above). diff --git a/config/README.md b/config/README.md new file mode 100644 index 0000000..2f5541b --- /dev/null +++ b/config/README.md @@ -0,0 +1,60 @@ + +# General settings +To configure this workflow, modify ``config/config.yaml`` according to your needs, following the explanations provided in the file. + +# Sample sheet + +Add samples to `config/samples.tsv`. For each sample, the columns `sample`, `group`, `control`, and `antibody` have to be defined. +* Samples / IP (immunoprecipitations) within the same `group` represents replicates and must have the same antibody and the same control. +* Controls / Input are listed like samples, but they do not have entries in the columns for `control` and `antibody`. +* The identifiers of each control has to be noted in the column `sample`. +* For all samples, the identifiers of the corresponding controls have to be given in the `control` column (see example below). + +**Sample sheet example**: +* Samples / IP: A, B and C +* Controls / Input: D and E + +| sample | group | batch_effect | control | antibody | +|--------|--------|--------------|---------|----------| +| A | TNFa | batch1 | D | p65 | +| B | TNFa | batch2 | D | p65 | +| C | E2TNFa | batch1 | E | p65 | +| D | TNFa | batch1 | | | +| E | E2TNFa | batch1 | | | + +# Unit sheet + +For each sample, add one or more sequencing units (runs or lanes) to the unit sheet `config/units.tsv`. For each unit, the columns `sample`, `unit`, `platform` and either `fq1` (single-end reads) or `fq1` and `fq2` (paired-end reads) or `sra_accession` have to be defined. +* Each unit has a name specified in column `unit`, which can be e.g. a running number, or an actual run, lane or replicate id. +* Each unit has a `sample` name, which associates it with the biological sample it comes from. +* For single-end reads define for each unit either a path to FASTQ file (column `fq1`) or define an SRA (sequence read archive) accession (starting with e.g. ERR or SRR) by using the column `sra_accession`. +* For paired-end reads define for each unit either two paths to FASTQ files (columns `fq1`, `fq2`) or define an SRA accession (column `sra_accession`). +* In case SRA accession numbers are used, the pipeline will automatically download the corresponding reads from SRA. If both local files and SRA accession are available, the local files will be preferred. +* The platform column needs to contain the used sequencing platform (one of 'CAPILLARY', 'LS454', 'ILLUMINA', 'SOLID', 'HELICOS', 'IONTORRENT', 'ONT', 'PACBIO’). + +**Unit sheet example for single-end reads:** + +| sample | unit | fq1 | fq2 | sra_accession | platform | +|--------|------|----------------------|-----|---------------|----------| +| A | 1 | data/A-run1.fastq.gz | | | ILLUMINA | +| B | 1 | data/B-run1.fastq.gz | | | ILLUMINA | +| B | 2 | data/B-run2.fastq.gz | | | ILLUMINA | +| C | 1 | data/C-run1.fastq.gz | | | ILLUMINA | + +**Unit sheet example for paired-end reads:** + +| sample | unit | fq1 | fq2 | sra_accession | platform | +|--------|------|------------------------|------------------------|---------------|----------| +| A | 1 | data/A-run1_1.fastq.gz | data/A-run1_2.fastq.gz | | ILLUMINA | +| B | 1 | data/B-run1_1.fastq.gz | data/B-run1_2.fastq.gz | | ILLUMINA | +| B | 2 | data/B-run2_1.fastq.gz | data/B-run1_2.fastq.gz | | ILLUMINA | +| C | 1 | data/C-run1_1.fastq.gz | data/C-run1_2.fastq.gz | | ILLUMINA | + +**Unit sheet example with SRA download (single-end or paired-end reads):** + +| sample | unit | fq1 | fq2 | sra_accession | platform | +|--------|------|-----|-----|---------------|----------| +| A | 1 | | | SRR1635456 | ILLUMINA | +| B | 1 | | | SRR1635457 | ILLUMINA | +| B | 2 | | | SRR1635458 | ILLUMINA | +| C | 1 | | | SRR1635439 | ILLUMINA | diff --git a/config/config.yaml b/config/config.yaml index 4f90f2b..1fe58de 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,8 +1,9 @@ -# This file should contain everything to configure the workflow on a global scale. -# In case of sample based data, it should be complemented by a samples.tsv file that contains +# This file contain everything to configure the workflow on a global scale. +# The sample based data must be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config/samples.tsv" -units: "config/units.tsv" # to download reads from SRA the accession numbers (see https://www.ncbi.nlm.nih.gov/sra) of samples must be given in units.tsv +# to download reads from SRA the accession numbers (see https://www.ncbi.nlm.nih.gov/sra) of samples must be given in units.tsv +units: "config/units.tsv" single_end: False resources: @@ -16,9 +17,9 @@ resources: release: 101 # Genome build build: R64-1-1 - # for testing data only chromosome 21 is selected + # for testing data a specific chromosome can be selected chromosome: - # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), default: 1.2.2 + # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 # if igenomes.yaml cannot be used, a value for the mappable or effective genome size can be specified here, e.g. macs-gsize: 2.7e9 macs-gsize: @@ -41,7 +42,7 @@ params: picard_metrics: activate: True deseq2: - # optional to run vst transform instead of rlog + # set to True to use the vst transformation instead of the rlog transformation for the DESeq2 analysis vst: False peak-annotation-analysis: activate: True @@ -53,9 +54,17 @@ params: # if duplicates should be removed in this filtering, add "-F 0x0400" to the params # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), - # the -L option is automatically activated if a path to a blacklist of the given genome exists in "config/igenomes.yaml" or has been entered there + # the -L option is automatically activated if a path to a blacklist of the given genome exists in the + # downloaded "resources/ref/igenomes.yaml" or has been provided via the parameter + # "config['resources']['ref']['blacklist']" in this configuration file samtools-view-se: "-b -F 0x004" samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" + plotfingerprint: + # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + number-of-samples: 500000 + # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step + # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- + collect-multiple-metrics: VALIDATION_STRINGENCY=LENIENT # TODO: move adapter parameters into a `adapter` column in units.tsv and check for its presence via the units.schema.yaml -- this enables unit-specific adapters, e.g. when integrating multiple datasets # these cutadapt parameters need to contain the required flag(s) for # the type of adapter(s) to trim, i.e.: diff --git a/config/samples.tsv b/config/samples.tsv index 42fab3a..d9e2b76 100644 --- a/config/samples.tsv +++ b/config/samples.tsv @@ -1,5 +1 @@ sample group batch_effect control antibody -A treated batch1 D BCATENIN -B untreated batch2 D BCATENIN -C treated batch1 D TCF4 -D untreated batch2 diff --git a/config/units.tsv b/config/units.tsv index 7da20e5..3e27f79 100644 --- a/config/units.tsv +++ b/config/units.tsv @@ -1,6 +1 @@ sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 resources/reads/a.chr21.1.fq resources/reads/a.chr21.2.fq ILLUMINA -B 1 resources/reads/b.chr21.1.fq resources/reads/b.chr21.2.fq ILLUMINA -B 2 300 14 resources/reads/b.chr21.1.fq ILLUMINA -C 1 resources/reads/a.chr21.1.fq resources/reads/a.chr21.2.fq ILLUMINA -D 1 resources/reads/b.chr21.1.fq resources/reads/b.chr21.2.fq ILLUMINA diff --git a/workflow/report/plot_annotatepeaks_summary_homer.rst b/workflow/report/plot_annotatepeaks_summary_homer.rst index 47826cc..d417353 100644 --- a/workflow/report/plot_annotatepeaks_summary_homer.rst +++ b/workflow/report/plot_annotatepeaks_summary_homer.rst @@ -1 +1,3 @@ -**HOMER** peak annotation summary plot is generated by calculating the proportion of {{snakemake.config["params"]["peak-analysis"]}} peaks assigned to genomic features by `HOMER annotatePeaks.pl `_. +**HOMER** peak annotation summary plot is generated by calculating the proportion of +{{snakemake.config["params"]["peak-analysis"]}} peaks assigned to genomic features by +`HOMER annotatePeaks.pl `_. diff --git a/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst b/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst index 8d1c8b6..3631d8f 100644 --- a/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst +++ b/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst @@ -1 +1,8 @@ - +**`Base distribution by cycle plot +`_ (Picard)** is +used as quality control for alignment-level and shows the nucleotide distribution per cycle of the bam files after +filtering, sorting, merging and removing orphans. For any cycle within reads the relative proportions of nucleotides +should reflect the AT:CG content. For all nucleotides flattish lines would be expected and any spikes would suggest a +systematic sequencing error. For more information about `collected Picard metrics +`_ please +see `documentation `_. diff --git a/workflow/report/plot_consensus_peak_intersect.rst b/workflow/report/plot_consensus_peak_intersect.rst index 652e3f1..e6d5864 100644 --- a/workflow/report/plot_consensus_peak_intersect.rst +++ b/workflow/report/plot_consensus_peak_intersect.rst @@ -1 +1,2 @@ -**MACS2 and bedtools** merged consensus {{ snakemake.wildcards.peak }} peaks plot is generated by calculating the proportion of intersection size assigned to {{ snakemake.wildcards.samples }} for {{ snakemake.wildcards.antibody }}. +**MACS2 and bedtools** merged consensus {{ snakemake.wildcards.peak }} peaks plot is generated by calculating the +proportion of intersection size assigned to {{ snakemake.wildcards.samples }} for {{ snakemake.wildcards.antibody }}. diff --git a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst index 9314621..2c83f01 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst @@ -1,5 +1,6 @@ **`MA plot `_ (FDR 0.01)** shows the log2 fold changes versus the mean of normalized counts from `DESeq2 `_ analysis filtered on a false -discovery rate (FDR) threshold of 0.01 for all samples of each antibody. For more information about DESeq2 please see +discovery rate (FDR) threshold of 0.01 for pairwise comparisons of samples across the groups from a particular antibody. +For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst b/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst index 526f5d0..3fa72b2 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst @@ -1,5 +1,6 @@ **Volcano plot (FDR 0.01)** shows the significance (adjusted p-value) versus the log2 fold changes of the results of the `DESeq2 `_ analysis filtered on a false -discovery rate (FDR) threshold of 0.01 for all samples of each antibody. For more information about DESeq2 please see +discovery rate (FDR) threshold of 0.01 for pairwise comparisons of samples across the groups from a particular antibody. +For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst index 09a79f5..06aee7f 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst @@ -1,2 +1,8 @@ -**`MA plot `_ (FDR 0.05)** shows the log2 fold changes versus the mean of normalized counts from `DESeq2 `_ analysis filtered on a false discovery rate (FDR) threshold of 0.05 for all samples of each antibody. For more information about DESeq2 please see `documentation https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html`_. +**`MA plot `_ (FDR 0.05)** +shows the log2 fold changes versus the mean of normalized counts from +`DESeq2 `_ analysis filtered on a false +discovery rate (FDR) threshold of 0.05 for pairwise comparisons of samples across the groups from a particular antibody. +For more information about DESeq2 please see +`documentation `_. + diff --git a/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst b/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst index 002e15a..f6fe20a 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst @@ -1,4 +1,5 @@ **Volcano plot (FDR 0.05)** shows the significance (adjusted p-value) versus the log2 fold changes of the results of the `DESeq2 `_ analysis filtered on a false -discovery rate (FDR) threshold of 0.05 for all samples of each antibody. For more information about DESeq2 please see +discovery rate (FDR) threshold of 0.05 for pairwise comparisons of samples across the groups from a particular antibody. +For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_heatmap.rst b/workflow/report/plot_deseq2_heatmap.rst index 8d1c8b6..1d3397c 100644 --- a/workflow/report/plot_deseq2_heatmap.rst +++ b/workflow/report/plot_deseq2_heatmap.rst @@ -1 +1,7 @@ - +**`Heatmap plot `_** shows for each antibody a heatmap of +the `rlog `_ or +`vst `_ transformed counts from +`DESeq2 `_ analysis. For more +information about DESeq2 please see +`documentation `_. + diff --git a/workflow/report/plot_deseq2_sample_corr_heatmap.rst b/workflow/report/plot_deseq2_sample_corr_heatmap.rst index 8d1c8b6..e23f22c 100644 --- a/workflow/report/plot_deseq2_sample_corr_heatmap.rst +++ b/workflow/report/plot_deseq2_sample_corr_heatmap.rst @@ -1 +1,6 @@ - +**Correlation `heatmap plots `_** shows heatmaps of the +`rlog `_ or +`vst `_ transformed counts from +`DESeq2 `_ analysis for each +pairwise comparison of samples across the groups from a particular antibody. For more information about DESeq2 please +see `documentation `_. diff --git a/workflow/report/plot_deseq2_scatter.rst b/workflow/report/plot_deseq2_scatter.rst index 2b0fa18..4071ea8 100644 --- a/workflow/report/plot_deseq2_scatter.rst +++ b/workflow/report/plot_deseq2_scatter.rst @@ -1,6 +1,6 @@ **Scatter plots** uses the matrix of the `rlog `_ or `vst `_ transformed counts from -`DESeq2 `_ analysis for each sample -to compare them in paired combinations. For more information about DESeq2 please see +`DESeq2 `_ analysis for +pairwise comparisons of samples across the groups from a particular antibody. For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_frip_score_macs2_bedtools.rst b/workflow/report/plot_frip_score_macs2_bedtools.rst index ecf3919..655c187 100644 --- a/workflow/report/plot_frip_score_macs2_bedtools.rst +++ b/workflow/report/plot_frip_score_macs2_bedtools.rst @@ -1,2 +1,3 @@ -**MACS2 FRiP score** is generated by calculating the fraction of all mapped reads that fall into the `MACS2 `_ called {{snakemake.config["params"]["peak-analysis"]}} peak regions. +**MACS2 FRiP score** is generated by calculating the fraction of all mapped reads that fall into the +`MACS2 `_ called {{snakemake.config["params"]["peak-analysis"]}} peak regions. A read must overlap a peak by at least 20% to be counted to `FRiP score `_. diff --git a/workflow/report/plot_insert_size_histogram_picard_mm.rst b/workflow/report/plot_insert_size_histogram_picard_mm.rst index 8d1c8b6..348a398 100644 --- a/workflow/report/plot_insert_size_histogram_picard_mm.rst +++ b/workflow/report/plot_insert_size_histogram_picard_mm.rst @@ -1 +1,6 @@ - +**`Insert size histogram +`_ (Picard)** is used +for validating paired-end library construction. It shows the insert size distribution versus fractions of read pairs in +each of the three orientations (FR, RF, and TANDEM) as a histogram. For more information about `collected Picard metrics +`_ please +see `documentation `_. diff --git a/workflow/report/plot_macs2_qc.rst b/workflow/report/plot_macs2_qc.rst index 8d1c8b6..9c2ba8c 100644 --- a/workflow/report/plot_macs2_qc.rst +++ b/workflow/report/plot_macs2_qc.rst @@ -1 +1,6 @@ - +**`MACS2 `_ callpeak quality control plots** show for all +samples and their associated controls the number of peaks and their distributions of peak length, fold-change, FDR and +p-value from the results of the +`MACS callpeak analysis `_. +For more information about Model-based Analysis of ChIP-Seq (MACS) please see published +`article `_. diff --git a/workflow/report/plot_peaks_count_macs2.rst b/workflow/report/plot_peaks_count_macs2.rst index 160b8c6..d83da3d 100644 --- a/workflow/report/plot_peaks_count_macs2.rst +++ b/workflow/report/plot_peaks_count_macs2.rst @@ -1 +1,2 @@ -**MACS2 peak count** is calculated from total number of {{snakemake.config["params"]["peak-analysis"]}} peaks called by `MACS2 `_. +**MACS2 peak count** is calculated from total number of {{snakemake.config["params"]["peak-analysis"]}} peaks called by +`MACS2 `_. diff --git a/workflow/report/plot_quality_by_cycle_picard_mm.rst b/workflow/report/plot_quality_by_cycle_picard_mm.rst index 8d1c8b6..5219fe1 100644 --- a/workflow/report/plot_quality_by_cycle_picard_mm.rst +++ b/workflow/report/plot_quality_by_cycle_picard_mm.rst @@ -1 +1,7 @@ - +**`Mean quality by cycle plot +`_ (Picard)** is used as +quality control for sequencing machine performance and collects the mean quality by cycle of the bam files after +filtering, sorting, merging and removing orphans. Any spikes in quality within reads may indicate technical problems +during sequencing. For more information about `collected Picard metrics +`_ please +see `documentation `_. diff --git a/workflow/report/plot_quality_distribution_picard_mm.rst b/workflow/report/plot_quality_distribution_picard_mm.rst index 8d1c8b6..4793171 100644 --- a/workflow/report/plot_quality_distribution_picard_mm.rst +++ b/workflow/report/plot_quality_distribution_picard_mm.rst @@ -1 +1,7 @@ - +**`Quality score distribution plot +`_ (Picard)** is used +as overall quality control for a library in a given run. It shows for the range of quality scores the corresponding +total numbers of bases. For more information about `collected Picard metrics +`_ please +see `documentation `_. + diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst index 189810d..0b6ef61 100644 --- a/workflow/report/workflow.rst +++ b/workflow/report/workflow.rst @@ -1 +1,2 @@ -**ChIP-seq** peak-calling, QC and differential analysis pipeline (Snakemake port of the `nextflow pipeline `_). +**ChIP-seq** peak-calling, QC and differential analysis pipeline (Snakemake port of the +`nextflow pipeline `_). diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index b70336c..872d490 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -387,21 +387,22 @@ def all_input(wildcards): "results/deseq2/sizeFactors/{antibody}.consensus_{peak}-peaks.sizeFactors.RData", "results/deseq2/sizeFactors/{antibody}.consensus_{peak}-peaks.sizeFactors.sizeFactor.txt", "results/deseq2/results/{antibody}.consensus_{peak}-peaks.deseq2_results.txt", - "results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.01.results.txt", - "results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.results.txt", - "results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.01.results.bed", - "results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.results.bed", - "results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.01_MA_plot.pdf", - "results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.05_MA_plot.pdf", - "results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.01_volcano_plot.pdf", - "results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.05_volcano_plot.pdf", - "results/deseq2/plots/{antibody}.consensus_{peak}-peaks_sample_corr_heatmap.pdf", - "results/deseq2/plots/{antibody}.consensus_{peak}-peaks_scatter_plots.pdf" + "results/deseq2/FDR/results/FDR_0.01_{antibody}.consensus_{peak}-peaks", + "results/deseq2/FDR/results/FDR_0.05_{antibody}.consensus_{peak}-peaks", + "results/deseq2/FDR/bed_files/FDR_0.01_{antibody}.consensus_{peak}-peaks", + "results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks", + "results/deseq2/comparison_plots/MA_plots/FDR_0.01_{antibody}consensus_{peak}-peaks", + "results/deseq2/comparison_plots/MA_plots/FDR_0.05_{antibody}consensus_{peak}-peaks", + "results/deseq2/comparison_plots/volcano_plots/FDR_0.01_{antibody}consensus_{peak}-peaks", + "results/deseq2/comparison_plots/volcano_plots/FDR_0.05_{antibody}consensus_{peak}-peaks", + "results/deseq2/comparison_plots/correlation_heatmaps_{antibody}consensus_{peak}-peaks", + "results/deseq2/comparison_plots/scatter_plots_{antibody}consensus_{peak}-peaks" ], peak = config["params"]["peak-analysis"], antibody = antibody ) ) + wanted_input.extend( expand( [ diff --git a/workflow/rules/consensus_peak_analysis.smk b/workflow/rules/consensus_peak_analysis.smk index 8357ace..ef0f1bd 100644 --- a/workflow/rules/consensus_peak_analysis.smk +++ b/workflow/rules/consensus_peak_analysis.smk @@ -69,7 +69,10 @@ rule plot_peak_intersect: input: "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.intersect.txt" output: - report("results/macs2_merged_expand/plots/{antibody}.consensus_{peak}-peaks.boolean.intersect.plot.pdf", caption="../report/plot_consensus_peak_intersect.rst", category="ConsensusPeak") + report( + "results/macs2_merged_expand/plots/{antibody}.consensus_{peak}-peaks.boolean.intersect.plot.pdf", + caption="../report/plot_consensus_peak_intersect.rst", + category="ConsensusPeak") conda: "../envs/consensus_plot.yaml" log: @@ -85,7 +88,8 @@ rule create_consensus_igv: log: "logs/igv/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.log" shell: - "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.boolean.bed' -exec echo -e 'results/IGV/consensus/{wildcards.antibody}/\"{{}}\"\t0,0,0' \; > {output} 2> {log}" + "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.boolean.bed' -exec " + "echo -e 'results/IGV/consensus/{wildcards.antibody}/\"{{}}\"\t0,0,0' \; > {output} 2> {log}" rule homer_consensus_annotatepeaks: input: @@ -133,9 +137,8 @@ rule merge_bool_and_annotatepeaks: rule feature_counts: input: - sam=lambda wc: expand(["results/filtered/{sample}.sorted.bam", "results/filtered/{control}.sorted.bam"], - sample=get_samples_of_antibody(wc.antibody), - control=get_controls_of_antibody(wc.antibody)), + sam=lambda wc: expand("results/filtered/{sample}.sorted.bam", + sample=get_samples_of_antibody(wc.antibody)), annotation="results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.saf" output: multiext("results/feature_counts/{antibody}.consensus_{peak}-peaks", @@ -170,35 +173,55 @@ rule featurecounts_deseq2: "results/feature_counts/{antibody}.consensus_{peak}-peaks_modified.featureCounts" output: dds="results/deseq2/dss_rld/{antibody}.consensus_{peak}-peaks.dds.rld.RData", - plot_pca=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks.pca_plot.pdf", #ToDo: add description to report caption + plot_pca=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks.pca_plot.pdf", caption = "../report/plot_deseq2_pca.rst", category = "DESeq2"), - plot_heatmap=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks.heatmap_plot.pdf", #ToDo: add description to report caption + plot_heatmap=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks.heatmap_plot.pdf", caption = "../report/plot_deseq2_heatmap.rst", category = "DESeq2"), pca_data="results/deseq2/pca_vals/{antibody}.consensus_{peak}-peaks.pca.vals.txt", dist_data="results/deseq2/dists/{antibody}.consensus_{peak}-peaks.sample.dists.txt", size_factors_rdata="results/deseq2/sizeFactors/{antibody}.consensus_{peak}-peaks.sizeFactors.RData", size_factors_res="results/deseq2/sizeFactors/{antibody}.consensus_{peak}-peaks.sizeFactors.sizeFactor.txt", results="results/deseq2/results/{antibody}.consensus_{peak}-peaks.deseq2_results.txt", - FDR_1_perc_res="results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.01.results.txt", - FDR_5_perc_res="results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.results.txt", - FDR_1_perc_bed="results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.01.results.bed", - FDR_5_perc_bed="results/deseq2/FDR/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.results.bed", - plot_FDR_1_perc_MA=report("results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.01_MA_plot.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_FDR_1_perc_MA.rst", category = "DESeq2-FDR"), - plot_FDR_5_perc_MA=report("results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.05_MA_plot.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_FDR_5_perc_MA.rst", category = "DESeq2-FDR"), - plot_FDR_1_perc_volcano=report("results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.01_volcano_plot.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_FDR_1_perc_volcano.rst", category = "DESeq2-FDR"), - plot_FDR_5_perc_volcano=report("results/deseq2/plots/FDR/{antibody}.consensus_{peak}-peaks_FDR_0.05_volcano_plot.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_FDR_5_perc_volcano.rst", category = "DESeq2-FDR"), - plot_sample_corr_heatmap=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks_sample_corr_heatmap.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_sample_corr_heatmap.rst", category = "DESeq2"), - plot_scatter=report("results/deseq2/plots/{antibody}.consensus_{peak}-peaks_scatter_plots.pdf", #ToDo: add description to report caption - caption = "../report/plot_deseq2_scatter.rst", category = "DESeq2") + # pairwise comparisons of samples across the groups from a particular antibody + FDR_1_perc_res=directory("results/deseq2/FDR/results/FDR_0.01_{antibody}.consensus_{peak}-peaks"), + FDR_5_perc_res=directory("results/deseq2/FDR/results/FDR_0.05_{antibody}.consensus_{peak}-peaks"), + FDR_1_perc_bed=directory("results/deseq2/FDR/bed_files/FDR_0.01_{antibody}.consensus_{peak}-peaks"), + FDR_5_perc_bed=directory("results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks"), + plot_FDR_1_perc_MA=report( + directory("results/deseq2/comparison_plots/MA_plots/FDR_0.01_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{{group_1_vs_group_2}}.MA-plot_FDR_0.01.pdf"], + caption = "../report/plot_deseq2_FDR_1_perc_MA.rst", + category = "DESeq2-FDR"), + plot_FDR_5_perc_MA=report( + directory("results/deseq2/comparison_plots/MA_plots/FDR_0.05_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{{group_1_vs_group_2}.MA-plot_FDR_0.05.pdf"], + caption = "../report/plot_deseq2_FDR_5_perc_MA.rst", + category = "DESeq2-FDR"), + plot_FDR_1_perc_volcano=report( + directory("results/deseq2/comparison_plots/volcano_plots/FDR_0.01_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{{group_1_vs_group_2}}.volcano-plot_FDR_0.01.pdf"], + caption = "../report/plot_deseq2_FDR_1_perc_volcano.rst", + category = "DESeq2-FDR"), + plot_FDR_5_perc_volcano=report( + directory("results/deseq2/comparison_plots/volcano_plots/FDR_0.05_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{{group_1_vs_group_2}}.volcano-plot_FDR_0.05.pdf"], + caption = "../report/plot_deseq2_FDR_5_perc_volcano.rst", + category = "DESeq2-FDR"), + plot_sample_corr_heatmap=report( + directory("results/deseq2/comparison_plots/correlation_heatmaps_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{{group_1_vs_group_2}}.correlation_heatmap.pdf"], + caption = "../report/plot_deseq2_sample_corr_heatmap.rst", + category = "DESeq2"), + plot_scatter=report( + directory("results/deseq2/comparison_plots/scatter_plots_{antibody}consensus_{peak}-peaks"), + patterns=["{antibody}.{{group_1_vs_group_2}}.scatter_plots.pdf"], + caption = "../report/plot_deseq2_scatter.rst", + category = "DESeq2") threads: 2 params: - vst = config["params"]["deseq2"]["vst"] + vst = config["params"]["deseq2"]["vst"], + antibody = lambda w: w.antibody log: "logs/deseq2/{antibody}.consensus_{peak}-peaks.featureCounts.log" conda: @@ -214,4 +237,5 @@ rule create_deseq2_igv: log: "logs/igv/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.log" shell: - "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.deseq2.FDR_0.05.results.bed' -exec echo -e 'results/IGV/consensus/{wildcards.antibody}/deseq2/\"{{}}\"\t255,0,0' \; > {output} 2> {log}" + "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.deseq2.FDR_0.05.results.bed' -exec " + "echo -e 'results/IGV/consensus/{wildcards.antibody}/deseq2/\"{{}}\"\t255,0,0' \; > {output} 2> {log}" diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk index f3ec86d..a634c31 100644 --- a/workflow/rules/filtering.smk +++ b/workflow/rules/filtering.smk @@ -4,14 +4,8 @@ rule samtools_view_filter: output: temp("results/sam-view/{sample}.bam") params: - # if duplicates should be removed in this filtering, add "-F 0x0400" to the params - # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params - # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), - # the -L option is automatically activated if a path to a blacklist of the given genome exists in the - # downloaded "resources/ref/igenomes.yaml" or has been provided via the "config/config.yaml" - # parameter "config['resources']['ref']['blacklist']" - lambda wc, input: "-b -F 0x004 {pe_params} {blacklist}".format( - pe_params="" if config["single_end"] else "-G 0x009 -f 0x001", + lambda wc, input: "{flags} {blacklist}".format( + flags=config["params"]["samtools-view-se"] if config["single_end"] else config["params"]["samtools-view-pe"], blacklist="" if len(input) == 1 else "-L {}".format(list(input)[1]) ) log: diff --git a/workflow/rules/peak_analysis.smk b/workflow/rules/peak_analysis.smk index a02e899..e21c3b3 100644 --- a/workflow/rules/peak_analysis.smk +++ b/workflow/rules/peak_analysis.smk @@ -6,9 +6,12 @@ rule plot_fingerprint: stats=expand("results/{step}/{{sample}}.sorted.{step}.stats.txt", step="bamtools_filtered" if config["single_end"] else "orph_rm_pe") - output: #ToDo: add description to report caption + output: # https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/deeptools/plotfingerprint.html. - fingerprint=report("results/deeptools/{sample}-{control}.plot_fingerprint.pdf", caption="../report/plot_fingerprint_deeptools.rst", category="QC"), + fingerprint=report( + "results/deeptools/{sample}-{control}.plot_fingerprint.pdf", + caption="../report/plot_fingerprint_deeptools.rst", + category="QC"), counts="results/deeptools/{sample}-{control}.fingerprint_counts.txt", qc_metrics="results/deeptools/{sample}-{control}.fingerprint_qcmetrics.txt" log: @@ -16,7 +19,7 @@ rule plot_fingerprint: params: "--labels {sample} {control}", "--skipZeros ", - "--numberOfSamples 500000 ", # ToDo: to config? + "--numberOfSamples {} ".format(config["params"]["plotfingerprint"]["number-of-samples"]), lambda w, input: "{se_option}{fragment_size}".format( se_option="--extendReads " if config["single_end"] else "", @@ -112,7 +115,10 @@ rule sm_report_peaks_count_plot: input: get_peaks_count_plot_input() output: - report("results/macs2_callpeak/plots/plot_{peak}_peaks_count.pdf", caption="../report/plot_peaks_count_macs2.rst", category="CallPeaks") + report( + "results/macs2_callpeak/plots/plot_{peak}_peaks_count.pdf", + caption="../report/plot_peaks_count_macs2.rst", + category="CallPeaks") log: "logs/macs2_callpeak/plot_{peak}_peaks_count.log" conda: @@ -136,8 +142,9 @@ rule bedtools_intersect: rule frip_score: input: intersect="results/bedtools_intersect/{sample}-{control}.{peak}.intersected.bed", - flagstats=expand("results/{step}/{{sample}}.sorted.{step}.flagstat", step= "bamtools_filtered" if config["single_end"] - else "orph_rm_pe") + flagstats=expand( + "results/{step}/{{sample}}.sorted.{step}.flagstat", + step= "bamtools_filtered" if config["single_end"] else "orph_rm_pe") output: "results/bedtools_intersect/{sample}-{control}.{peak}.peaks_frip.tsv" log: @@ -155,7 +162,10 @@ rule sm_rep_frip_score: input: get_frip_score_input() output: - report("results/macs2_callpeak/plots/plot_{peak}_peaks_frip_score.pdf", caption="../report/plot_frip_score_macs2_bedtools.rst", category="CallPeaks") + report( + "results/macs2_callpeak/plots/plot_{peak}_peaks_frip_score.pdf", + caption="../report/plot_frip_score_macs2_bedtools.rst", + category="CallPeaks") log: "logs/bedtools/intersect/plot_{peak}_peaks_frip_score.log" conda: @@ -171,7 +181,8 @@ rule create_igv_peaks: log: "logs/igv/create_igv_peaks/merged_library.{sample}-{control}.{peak}_peaks.log" shell: - " find {input} -type f -name '*_peaks.{wildcards.peak}Peak' -exec echo -e 'results/IGV/macs2_callpeak/{wildcards.peak}/\"{{}}\"\t0,0,178' \; > {output} 2> {log}" + " find {input} -type f -name '*_peaks.{wildcards.peak}Peak' -exec " + "echo -e 'results/IGV/macs2_callpeak/{wildcards.peak}/\"{{}}\"\t0,0,178' \; > {output} 2> {log}" rule homer_annotatepeaks: input: @@ -193,9 +204,12 @@ rule homer_annotatepeaks: rule plot_macs_qc: input: get_macs2_peaks() - output: #ToDo: add description to report caption + output: summmary="results/macs2_callpeak/plots/plot_{peak}_peaks_macs2_summary.txt", - plot=report("results/macs2_callpeak/plots/plot_{peak}_peaks_macs2.pdf", caption="../report/plot_macs2_qc.rst", category="CallPeaks") + plot=report( + "results/macs2_callpeak/plots/plot_{peak}_peaks_macs2.pdf", + caption="../report/plot_macs2_qc.rst", + category="CallPeaks") params: input = lambda wc, input: ','.join(input), sample_control_combinations = ','.join(get_sample_control_peak_combinations_list()) @@ -204,14 +218,18 @@ rule plot_macs_qc: conda: "../envs/plot_macs_annot.yaml" shell: - "Rscript ../workflow/scripts/plot_macs_qc.R -i {params.input} -s {params.sample_control_combinations} -o {output.plot} -p {output.summmary} 2> {log}" + "Rscript ../workflow/scripts/plot_macs_qc.R -i {params.input} -s {params.sample_control_combinations} " + "-o {output.plot} -p {output.summmary} 2> {log}" rule plot_homer_annotatepeaks: input: get_plot_homer_annotatepeaks_input() output: #ToDo: add description to report caption summmary="results/homer/plots/plot_{peak}_annotatepeaks_summary.txt", - plot=report("results/homer/plots/plot_{peak}_annotatepeaks.pdf", caption="../report/plot_annotatepeaks_homer.rst", category="CallPeaks") + plot=report( + "results/homer/plots/plot_{peak}_annotatepeaks.pdf", + caption="../report/plot_annotatepeaks_homer.rst", + category="CallPeaks") params: input = lambda wc, input: ','.join(input), sample_control_combinations = ','.join(get_sample_control_peak_combinations_list()) @@ -220,13 +238,17 @@ rule plot_homer_annotatepeaks: conda: "../envs/plot_macs_annot.yaml" shell: - "Rscript ../workflow/scripts/plot_homer_annotatepeaks.R -i {params.input} -s {params.sample_control_combinations} -o {output.plot} -p {output.summmary} 2> {log}" + "Rscript ../workflow/scripts/plot_homer_annotatepeaks.R -i {params.input} " + "-s {params.sample_control_combinations} -o {output.plot} -p {output.summmary} 2> {log}" rule plot_sum_annotatepeaks: input: "results/homer/plots/plot_{peak}_annotatepeaks_summary.txt" output: - report("results/homer/plots/plot_{peak}_annotatepeaks_summary.pdf", caption="../report/plot_annotatepeaks_summary_homer.rst", category="CallPeaks") + report( + "results/homer/plots/plot_{peak}_annotatepeaks_summary.pdf", + caption="../report/plot_annotatepeaks_summary_homer.rst", + category="CallPeaks") log: "logs/homer/plot_{peak}_annotatepeaks_summary.log" conda: diff --git a/workflow/rules/post-analysis.smk b/workflow/rules/post-analysis.smk index 36f6a6e..da86734 100644 --- a/workflow/rules/post-analysis.smk +++ b/workflow/rules/post-analysis.smk @@ -14,7 +14,7 @@ rule collect_multiple_metrics: input: bam="results/filtered/{sample}.sorted.bam", ref="resources/ref/genome.fasta" - output: #ToDo: add descriptions to report captions + output: # Through the output file extensions the different tools for the metrics can be selected # so that it is not necessary to specify them under params with the "PROGRAM" option. # Usable extensions (and which tools they implicitly call) are listed here: @@ -33,9 +33,18 @@ rule collect_multiple_metrics: ".quality_by_cycle_metrics", ".quality_distribution_metrics", ), - report("{path}{sample}.base_distribution_by_cycle.pdf", caption="../report/plot_base_distribution_by_cycle_picard_mm.rst", category="Multiple Metrics (picard)"), - report("{path}{sample}.quality_by_cycle.pdf", caption="../report/plot_quality_by_cycle_picard_mm.rst", category="Multiple Metrics (picard)"), - report("{path}{sample}.quality_distribution.pdf", caption="../report/plot_quality_distribution_picard_mm.rst", category="Multiple Metrics (picard)") + report( + "{path}{sample}.base_distribution_by_cycle.pdf", + caption="../report/plot_base_distribution_by_cycle_picard_mm.rst", + category="Multiple Metrics (picard)"), + report( + "{path}{sample}.quality_by_cycle.pdf", + caption="../report/plot_quality_by_cycle_picard_mm.rst", + category="Multiple Metrics (picard)"), + report( + "{path}{sample}.quality_distribution.pdf", + caption="../report/plot_quality_distribution_picard_mm.rst", + category="Multiple Metrics (picard)") resources: # This parameter (default 3 GB) can be used to limit the total resources a pipeline is allowed to use, see: # https://snakemake.readthedocs.io/en/stable/snakefiles/rules.html#resources @@ -43,8 +52,8 @@ rule collect_multiple_metrics: log: "logs/picard/{path}{sample}.log" params: - # optional parameters, TODO: move to config.yaml and load from there - "VALIDATION_STRINGENCY=LENIENT " + # optional parameters + "{} ".format(config["params"]["collect-multiple-metrics"]) wrapper: "0.64.0/bio/picard/collectmultiplemetrics" @@ -110,7 +119,8 @@ rule create_igv_bigwig: log: "logs/igv/create_igv_bigwig.log" shell: - "find {input} -type f -name '*.bigWig' -exec echo -e 'results/IGV/big_wig/\"{{}}\"\t0,0,178' \; > {output} 2> {log}" + "find {input} -type f -name '*.bigWig' -exec " + "echo -e 'results/IGV/big_wig/\"{{}}\"\t0,0,178' \; > {output} 2> {log}" rule compute_matrix: input: @@ -139,10 +149,13 @@ rule compute_matrix: rule plot_profile: input: "results/deeptools/matrix_files/matrix.gz" - output: #ToDo: add description to report caption + output: # Usable output variables, their extensions and which option they implicitly call are listed here: # https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/deeptools/plotprofile.html. - plot_img=report("results/deeptools/plot_profile.pdf", caption="../report/plot_profile_deeptools.rst", category="GenomicRegions"), + plot_img=report( + "results/deeptools/plot_profile.pdf", + caption="../report/plot_profile_deeptools.rst", + category="GenomicRegions"), data="results/deeptools/plot_profile_data.tab" log: "logs/deeptools/plot_profile.log" @@ -154,10 +167,13 @@ rule plot_profile: rule plot_heatmap: input: "results/deeptools/matrix_files/matrix.gz" - output: #ToDo: add description to report caption + output: # Usable output variables, their extensions and which option they implicitly call are listed here: # https://snakemake-wrappers.readthedocs.io/en/stable/wrappers/deeptools/plotheatmap.html. - heatmap_img=report("results/deeptools/heatmap.pdf", caption="../report/plot_heatmap_deeptools.rst", category="Heatmaps"), + heatmap_img=report( + "results/deeptools/heatmap.pdf", + caption="../report/plot_heatmap_deeptools.rst", + category="Heatmaps"), heatmap_matrix="results/deeptools/heatmap_matrix.tab" log: "logs/deeptools/heatmap.log" @@ -172,7 +188,10 @@ rule phantompeakqualtools: output: #ToDo: add description to report caption res_phantom="results/phantompeakqualtools/{sample}.phantompeak.spp.out", r_data="results/phantompeakqualtools/{sample}.phantompeak.Rdata", - plot=report("results/phantompeakqualtools/{sample}.phantompeak.pdf", caption="../report/plot_phantompeak_phantompeakqualtools.rst", category="Phantompeak") + plot=report( + "results/phantompeakqualtools/{sample}.phantompeak.pdf", + caption="../report/plot_phantompeak_phantompeakqualtools.rst", + category="Phantompeak") threads: 8 log: diff --git a/workflow/scripts/col_mod_featurecounts.py b/workflow/scripts/col_mod_featurecounts.py index 2e6a8b8..8256e66 100644 --- a/workflow/scripts/col_mod_featurecounts.py +++ b/workflow/scripts/col_mod_featurecounts.py @@ -11,10 +11,11 @@ def get_group_control_combination(bam_path): sample = os.path.basename(bam_path.split(".sorted.bam")[0]) sample_row = samples.loc[samples["sample"] == sample] group = sample_row["group"].iloc[0] + antibody = sample_row["antibody"].iloc[0] control = sample_row["control"].iloc[0] if pd.isnull(control): control = sample - return "{}_{}_{}".format(group, control, sample) + return "{}_{}_{}_{}".format(group, antibody, control, sample) def modify_header(old_header): diff --git a/workflow/scripts/featurecounts_deseq2.R b/workflow/scripts/featurecounts_deseq2.R index 0b178d7..2c0766a 100644 --- a/workflow/scripts/featurecounts_deseq2.R +++ b/workflow/scripts/featurecounts_deseq2.R @@ -285,16 +285,28 @@ if (file.exists(ResultsFile) == FALSE) { ## WRITE RESULTS FILE if (MIN_FDR == 0.01) { - CompResultsFile <- snakemake@output[["FDR_1_perc_res"]] - CompBEDFile <- snakemake@output[["FDR_1_perc_bed"]] - MAplotFile <- snakemake@output[["plot_FDR_1_perc_MA"]] # AVI: added to create separate pdf files - VolcanoPlotFile <- snakemake@output[["plot_FDR_1_perc_volcano"]] # AVI: added to create separate pdf files + # AVI: dynamically file name extensions added + dirs <- c(snakemake@output[["FDR_1_perc_res"]],snakemake@output[["FDR_1_perc_bed"]],snakemake@output[["plot_FDR_1_perc_MA"]], + snakemake@output[["plot_FDR_1_perc_volcano"]],snakemake@output[["FDR_5_perc_res"]],snakemake@output[["FDR_5_perc_bed"]], + snakemake@output[["plot_FDR_5_perc_MA"]],snakemake@output[["plot_FDR_5_perc_volcano"]],snakemake@output[["plot_sample_corr_heatmap"]], + snakemake@output[["plot_scatter"]]) + for (dir in dirs) { + if (file.exists(dir) == FALSE) { + dir.create(dir,recursive=TRUE) + } + } + + CompResultsFile <- paste(snakemake@output[["FDR_1_perc_res"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.txt", sep="") + CompBEDFile <- paste(snakemake@output[["FDR_1_perc_bed"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.bed", sep="") + MAplotFile <- paste(snakemake@output[["plot_FDR_1_perc_MA"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.01.pdf", sep="") # AVI: added to create separate pdf files + VolcanoPlotFile <- paste(snakemake@output[["plot_FDR_1_perc_volcano"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.01.pdf", sep="") # AVI: added to create separate pdf files } if (MIN_FDR == 0.05) { - CompResultsFile <- snakemake@output[["FDR_5_perc_res"]] - CompBEDFile <- snakemake@output[["FDR_5_perc_bed"]] - MAplotFile <- snakemake@output[["plot_FDR_5_perc_MA"]] # AVI: added to create separate pdf files - VolcanoPlotFile <- snakemake@output[["plot_FDR_5_perc_volcano"]] # AVI: added to create separate pdf files + # AVI: dynamically file name extensions added + CompResultsFile <- paste(snakemake@output[["FDR_5_perc_res"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.txt", sep="") + CompBEDFile <- paste(snakemake@output[["FDR_5_perc_bed"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.bed", sep="") + MAplotFile <- paste(snakemake@output[["plot_FDR_5_perc_MA"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.05.pdf", sep="") # AVI: added to create separate pdf files + VolcanoPlotFile <- paste(snakemake@output[["plot_FDR_5_perc_volcano"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.05.pdf", sep="") # AVI: added to create separate pdf files } write.table(pass.fdr.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE) @@ -317,11 +329,12 @@ if (file.exists(ResultsFile) == FALSE) { cat("\n",file=LogFile,append=TRUE,sep="") # AVI: creates required output files with message } else { - stop("More than 2 samples treated with the same antibody are needed to calculate the FDR & LOGFC.") + warning("More than 2 samples treated with the same antibody are needed to calculate the FDR & LOGFC.") } ## SAMPLE CORRELATION HEATMAP - pdf(file=snakemake@output[["plot_sample_corr_heatmap"]],width=10,height=8) # AVI: added to create separate pdf files + CorrHeatmapFile <- paste(snakemake@output[["plot_sample_corr_heatmap"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".correlation_heatmap.pdf", sep="") # AVI: dynamically file name extensions added + pdf(file=CorrHeatmapFile,width=10,height=8) # AVI: added to create separate pdf files rld.subset <- assay(rld)[,comp.samples] sampleDists <- dist(t(rld.subset)) sampleDistMatrix <- as.matrix(sampleDists) @@ -330,7 +343,8 @@ if (file.exists(ResultsFile) == FALSE) { dev.off() # AVI: added to create separate pdf files ## SCATTER PLOT FOR RLOG COUNTS - pdf(file=snakemake@output[["plot_scatter"]],width=10,height=8) # AVI: added to create separate pdf files + ScatterPlotFile <- paste(snakemake@output[["plot_scatter"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".scatter_plots.pdf", sep="") # AVI: dynamically file name extensions added + pdf(file=ScatterPlotFile,width=10,height=8) # AVI: added to create separate pdf files combs <- combn(comp.samples,2,simplify=FALSE) clabels <- sapply(combs,function(x){paste(x,collapse=' & ')}) plotdat <- data.frame(x=unlist(lapply(combs, function(x){rld.subset[, x[1] ]})),y=unlist(lapply(combs, function(y){rld.subset[, y[2] ]})),comp=rep(clabels, each=nrow(rld.subset))) From ce3177df8949ede519be09812f4f5ada76b8d2c4 Mon Sep 17 00:00:00 2001 From: AntonieV Date: Wed, 16 Jun 2021 22:43:42 +0200 Subject: [PATCH 03/40] report descriptions for homer annotatePeaks and phantompeakqualtools --- workflow/report/plot_annotatepeaks_homer.rst | 4 ++++ .../report/plot_phantompeak_phantompeakqualtools.rst | 11 ++++++++++- workflow/rules/post-analysis.smk | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/workflow/report/plot_annotatepeaks_homer.rst b/workflow/report/plot_annotatepeaks_homer.rst index e69de29..e835d23 100644 --- a/workflow/report/plot_annotatepeaks_homer.rst +++ b/workflow/report/plot_annotatepeaks_homer.rst @@ -0,0 +1,4 @@ +**`Homer annotatePeaks `_ ** is used to annotate the peaks relative +to known genomic features. The plots of this analysis show for all samples and their associated controls peak location +relative to annotation, percentage of unique genes to closest peak an peak distribution relative to TSS +(Transcription Start Site). diff --git a/workflow/report/plot_phantompeak_phantompeakqualtools.rst b/workflow/report/plot_phantompeak_phantompeakqualtools.rst index 8d1c8b6..232ef68 100644 --- a/workflow/report/plot_phantompeak_phantompeakqualtools.rst +++ b/workflow/report/plot_phantompeak_phantompeakqualtools.rst @@ -1 +1,10 @@ - + **`Phantompeakqualtools plot `_** shows strand-shift versus +cross-correlation and computes informative enrichment and quality measures for ChIP-seq data. It also calculates the +relative (RSC) and the normalized strand cross-correlation coefficient (NSC). Datasets with NSC values much less than +1.1 tend to have low signal to noise or few peaks. This may indicate poor quality or only few binding sites. RSC values +significantly lower than 1 (< 0.8) tend to have low signal to noise. The low scores can be due to several factors and +are often due to failed and poor quality ChIP or low read sequence quality. In section "ChIP-seq processing pipeline" +of MultiQC report are additional plots across all samples for RSC, NSC and strand-shift cross-correlation. + For more information about Phantompeakqualtools please +see `documentation `_. For more information +about interpretation see published `article `_. diff --git a/workflow/rules/post-analysis.smk b/workflow/rules/post-analysis.smk index da86734..bb84f38 100644 --- a/workflow/rules/post-analysis.smk +++ b/workflow/rules/post-analysis.smk @@ -185,7 +185,7 @@ rule plot_heatmap: rule phantompeakqualtools: input: "results/filtered/{sample}.sorted.bam" - output: #ToDo: add description to report caption + output: res_phantom="results/phantompeakqualtools/{sample}.phantompeak.spp.out", r_data="results/phantompeakqualtools/{sample}.phantompeak.Rdata", plot=report( From 5cf7ca9eb605cbc2740ff8bb8d8f6594c7c4c5b5 Mon Sep 17 00:00:00 2001 From: AntonieV Date: Wed, 23 Jun 2021 14:34:32 +0200 Subject: [PATCH 04/40] create igv session file --- workflow/Snakefile | 1 + workflow/rules/common.smk | 43 +++++- workflow/rules/consensus_peak_analysis.smk | 27 ++-- workflow/rules/igv_session.smk | 35 +++++ workflow/rules/peak_analysis.smk | 8 +- workflow/rules/post-analysis.smk | 4 +- workflow/rules/qc.smk | 2 +- workflow/scripts/featurecounts_deseq2.R | 2 + workflow/scripts/igv_files_to_session.py | 149 +++++++++++++++++++++ workflow/scripts/igv_report_cleanup.py | 3 + 10 files changed, 248 insertions(+), 26 deletions(-) create mode 100644 workflow/rules/igv_session.smk create mode 100755 workflow/scripts/igv_files_to_session.py create mode 100644 workflow/scripts/igv_report_cleanup.py diff --git a/workflow/Snakefile b/workflow/Snakefile index fa54668..a434099 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -22,6 +22,7 @@ include: "rules/utils.smk" include: "rules/post-analysis.smk" include: "rules/peak_analysis.smk" include: "rules/consensus_peak_analysis.smk" +include: "rules/igv_session.smk" rule all: input: all_input diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 872d490..3971c08 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -161,6 +161,38 @@ def get_read_group(wildcards): unit=wildcards.unit, platform=units.loc[(wildcards.sample, wildcards.unit), "platform"]) +# def get_deseq2_igv_input(): +# return [file for file in os.listdir("results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks")] + +def get_igv_input(): + igv_input = [] + igv_input.extend([ + "results/IGV/big_wig/merged_library.bigWig.igv.txt" + ]) + igv_input.extend( + expand( + [ + "results/IGV/macs2_callpeak-{peak}/merged_library.{sample_control_peak}_peaks.igv.txt", + + ], + sample_control_peak=get_sample_control_peak_combinations_list(), + peak=config["params"]["peak-analysis"] + ) + ) + unique_antibodies = set([antibody for antibody in samples["antibody"] if not pd.isnull(antibody)]) + for antibody in unique_antibodies: + igv_input.extend( + expand( + [ + "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.txt", + "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.txt" + ], + antibody=antibody, + peak=config["params"]["peak-analysis"] + ) + ) + return igv_input + def get_multiqc_input(wildcards): multiqc_input = [] for (sample, unit) in units.index: @@ -277,9 +309,10 @@ def all_input(wildcards): wanted_input = [] - # QC with fastQC and multiQC + # QC with fastQC and multiQC, igv session wanted_input.extend([ - "results/qc/multiqc/multiqc.html" + "results/qc/multiqc/multiqc.html", + "results/IGV/igv_session.xml" ]) # trimming reads @@ -312,7 +345,7 @@ def all_input(wildcards): wanted_input.extend( expand ( [ - "results/IGV/big_wig/merged_library.bigWig.igv.txt", + # "results/IGV/big_wig/merged_library.bigWig.igv.txt", "results/phantompeakqualtools/{sample}.phantompeak.pdf" ], sample = sample @@ -364,7 +397,7 @@ def all_input(wildcards): [ "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.saf", "results/macs2_merged_expand/plots/{antibody}.consensus_{peak}-peaks.boolean.intersect.plot.pdf", - "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.txt" + # "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.txt" ], peak = config["params"]["peak-analysis"], antibody = antibody @@ -410,7 +443,7 @@ def all_input(wildcards): "results/macs2_callpeak/{sample}-{control}.{peak}_treat_pileup.bdg", "results/macs2_callpeak/{sample}-{control}.{peak}_control_lambda.bdg", "results/macs2_callpeak/{sample}-{control}.{peak}_peaks.{peak}Peak", - "results/IGV/macs2_callpeak-{peak}/merged_library.{sample}-{control}.{peak}_peaks.igv.txt", + # "results/IGV/macs2_callpeak-{peak}/merged_library.{sample}-{control}.{peak}_peaks.igv.txt", "results/macs2_callpeak/plots/plot_{peak}_peaks_count.pdf", "results/macs2_callpeak/plots/plot_{peak}_peaks_frip_score.pdf", "results/macs2_callpeak/plots/plot_{peak}_peaks_macs2.pdf" diff --git a/workflow/rules/consensus_peak_analysis.smk b/workflow/rules/consensus_peak_analysis.smk index ef0f1bd..ce34a5f 100644 --- a/workflow/rules/consensus_peak_analysis.smk +++ b/workflow/rules/consensus_peak_analysis.smk @@ -1,3 +1,7 @@ +import os + +from snakemake import rules + rule bedtools_merge_broad: input: get_macs2_peaks() @@ -43,7 +47,7 @@ rule create_consensus_bed: input: "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.txt" output: - "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.bed" + report("results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.bed", category="accessory files") conda: "../envs/gawk.yaml" log: @@ -89,7 +93,9 @@ rule create_consensus_igv: "logs/igv/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.log" shell: "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.boolean.bed' -exec " - "echo -e 'results/IGV/consensus/{wildcards.antibody}/\"{{}}\"\t0,0,0' \; > {output} 2> {log}" + "echo -e '{{}}\t0,0,0' \; > {output} 2> {log}" + # "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.boolean.bed' -exec " + # "echo -e 'results/IGV/consensus/{wildcards.antibody}/\"{{}}\"\t0,0,0' \; > {output} 2> {log}" rule homer_consensus_annotatepeaks: input: @@ -186,7 +192,11 @@ rule featurecounts_deseq2: FDR_1_perc_res=directory("results/deseq2/FDR/results/FDR_0.01_{antibody}.consensus_{peak}-peaks"), FDR_5_perc_res=directory("results/deseq2/FDR/results/FDR_0.05_{antibody}.consensus_{peak}-peaks"), FDR_1_perc_bed=directory("results/deseq2/FDR/bed_files/FDR_0.01_{antibody}.consensus_{peak}-peaks"), - FDR_5_perc_bed=directory("results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks"), + FDR_5_perc_bed=report( + directory("results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks"), + patterns=["{antibody}.{{group_1_vs_group_2}}.FDR_0.05_results.bed"], + category="accessory files"), + igv_FDR_5_bed="results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.txt", plot_FDR_1_perc_MA=report( directory("results/deseq2/comparison_plots/MA_plots/FDR_0.01_{antibody}consensus_{peak}-peaks"), patterns=["{antibody}.{{group_1_vs_group_2}}.MA-plot_FDR_0.01.pdf"], @@ -228,14 +238,3 @@ rule featurecounts_deseq2: "../envs/featurecounts_deseq2.yaml" script: "../scripts/featurecounts_deseq2.R" - -rule create_deseq2_igv: - input: - "results/deseq2/results/{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.results.bed" - output: - "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.txt" - log: - "logs/igv/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.log" - shell: - "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.deseq2.FDR_0.05.results.bed' -exec " - "echo -e 'results/IGV/consensus/{wildcards.antibody}/deseq2/\"{{}}\"\t255,0,0' \; > {output} 2> {log}" diff --git a/workflow/rules/igv_session.smk b/workflow/rules/igv_session.smk new file mode 100644 index 0000000..4da8932 --- /dev/null +++ b/workflow/rules/igv_session.smk @@ -0,0 +1,35 @@ +rule create_igv_input_file: + input: + get_igv_input() + output: + # "results/IGV/merged_igv_files.txt" + "results/IGV/igv_files.txt" + log: + "logs/igv/merged_igv_files.log" + shell: + "cat {input} > {output} 2> {log}" + +# remove paths for igv session download from report.zip +# rule igv_report_cleanup: +# input: +# "results/IGV/merged_igv_files.txt" +# output: +# "results/IGV/igv_files.txt" +# log: +# "logs/igv/igv_files.log" +# script: +# " ../scripts/igv_report_cleanup.py" + +rule igv_files_to_session: + input: + # igv_data=get_igv_data, + igv="results/IGV/igv_files.txt", + fasta="resources/ref/genome.fasta" + output: + report("results/IGV/igv_session.xml", category = "igv session") + params: + "--path_prefix '../../'" + log: + "logs/igv/igv_session_to_file.log" + shell: + " ../workflow/scripts/igv_files_to_session.py {output} {input.igv} ../../{input.fasta} {params} 2> {log}" diff --git a/workflow/rules/peak_analysis.smk b/workflow/rules/peak_analysis.smk index e21c3b3..fdb413e 100644 --- a/workflow/rules/peak_analysis.smk +++ b/workflow/rules/peak_analysis.smk @@ -52,9 +52,9 @@ rule macs2_callpeak_broad: "_treat_pileup.bdg", "_control_lambda.bdg", # these output extensions internally set the --broad option: - "_peaks.broadPeak", "_peaks.gappedPeak" ), + report("results/macs2_callpeak/{sample}-{control}.broad_peaks.broadPeak", category="accessory files") log: "logs/macs2/callpeak.{sample}-{control}.broad.log" @@ -82,9 +82,9 @@ rule macs2_callpeak_narrow: "_treat_pileup.bdg", "_control_lambda.bdg", # these output extensions internally set the --broad option: - "_peaks.narrowPeak", "_summits.bed" - ) + ), + report("results/macs2_callpeak/{sample}-{control}.narrow_peaks.narrowPeak", category="accessory files") log: "logs/macs2/callpeak.{sample}-{control}.narrow.log" params: @@ -182,7 +182,7 @@ rule create_igv_peaks: "logs/igv/create_igv_peaks/merged_library.{sample}-{control}.{peak}_peaks.log" shell: " find {input} -type f -name '*_peaks.{wildcards.peak}Peak' -exec " - "echo -e 'results/IGV/macs2_callpeak/{wildcards.peak}/\"{{}}\"\t0,0,178' \; > {output} 2> {log}" + "echo -e '{{}}\t0,0,178' \; > {output} 2> {log}" rule homer_annotatepeaks: input: diff --git a/workflow/rules/post-analysis.smk b/workflow/rules/post-analysis.smk index bb84f38..79d346e 100644 --- a/workflow/rules/post-analysis.smk +++ b/workflow/rules/post-analysis.smk @@ -102,7 +102,7 @@ rule bedGraphToBigWig: bedGraph="results/bed_graph/{sample}.sorted.bedgraph", chromsizes="resources/ref/genome.chrom.sizes" output: - "results/big_wig/{sample}.bigWig" + report("results/big_wig/{sample}.bigWig", category="accessory files") log: "logs/big_wig/{sample}.log" params: @@ -120,7 +120,7 @@ rule create_igv_bigwig: "logs/igv/create_igv_bigwig.log" shell: "find {input} -type f -name '*.bigWig' -exec " - "echo -e 'results/IGV/big_wig/\"{{}}\"\t0,0,178' \; > {output} 2> {log}" + "echo -e '{{}}\t0,0,178' \; > {output} 2> {log}" rule compute_matrix: input: diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index bd9a7df..dedc65b 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -16,7 +16,7 @@ rule multiqc: input: get_multiqc_input output: - "results/qc/multiqc/multiqc.html" + report("results/qc/multiqc/multiqc.html", category="MultiQC report") log: "logs/multiqc.log" wrapper: diff --git a/workflow/scripts/featurecounts_deseq2.R b/workflow/scripts/featurecounts_deseq2.R index 2c0766a..7244bcf 100644 --- a/workflow/scripts/featurecounts_deseq2.R +++ b/workflow/scripts/featurecounts_deseq2.R @@ -305,6 +305,8 @@ if (file.exists(ResultsFile) == FALSE) { # AVI: dynamically file name extensions added CompResultsFile <- paste(snakemake@output[["FDR_5_perc_res"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.txt", sep="") CompBEDFile <- paste(snakemake@output[["FDR_5_perc_bed"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.bed", sep="") + # AVI: write paths of FDR 0.05 bed files to igv file + cat(paste0(CompBEDFile, "\t255,0,0\n"),file=snakemake@output[["igv_FDR_5_bed"]],append=TRUE) MAplotFile <- paste(snakemake@output[["plot_FDR_5_perc_MA"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.05.pdf", sep="") # AVI: added to create separate pdf files VolcanoPlotFile <- paste(snakemake@output[["plot_FDR_5_perc_volcano"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.05.pdf", sep="") # AVI: added to create separate pdf files } diff --git a/workflow/scripts/igv_files_to_session.py b/workflow/scripts/igv_files_to_session.py new file mode 100755 index 0000000..54c118a --- /dev/null +++ b/workflow/scripts/igv_files_to_session.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python + +### source: https://github.com/nf-core/chipseq/blob/master/bin/igv_files_to_session.py +### own changes and adjustments for snakemake-workflow chipseq are marked with "# AVI: " in comment + +#MIT License +# +#Copyright (c) Philip Ewels +# +#Permission is hereby granted, free of charge, to any person obtaining a copy +#of this software and associated documentation files (the "Software"), to deal +#in the Software without restriction, including without limitation the rights +#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +#copies of the Software, and to permit persons to whom the Software is +#furnished to do so, subject to the following conditions: +# +#The above copyright notice and this permission notice shall be included in all +#copies or substantial portions of the Software. +# +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +#SOFTWARE. + + + +####################################################################### +####################################################################### +## Created on July 4th 2018 to create IGV session file from file list +####################################################################### +####################################################################### + +import os +import errno +import argparse + +############################################ +############################################ +## PARSE ARGUMENTS +############################################ +############################################ + +Description = 'Create IGV session file from a list of files and associated colours - ".bed", ".bw", ".bigwig", ".tdf", ".gtf" files currently supported.' +Epilog = """Example usage: python igv_files_to_session.py """ + +argParser = argparse.ArgumentParser(description=Description, epilog=Epilog) + +## REQUIRED PARAMETERS +argParser.add_argument('XML_OUT', help="XML output file.") +argParser.add_argument('LIST_FILE', help="Tab-delimited file containing two columns i.e. file_name\tcolour. Header isnt required.") +argParser.add_argument('GENOME', help="Full path to genome fasta file or shorthand for genome available in IGV e.g. hg19.") + +## OPTIONAL PARAMETERS +argParser.add_argument('-pp', '--path_prefix', type=str, dest="PATH_PREFIX", default='', help="Path prefix to be added at beginning of all files in input list file.") +args = argParser.parse_args() + +############################################ +############################################ +## HELPER FUNCTIONS +############################################ +############################################ + +def makedir(path): + + if not len(path) == 0: + try: + os.makedirs(path) + except OSError as exception: + if exception.errno != errno.EEXIST: + raise + +############################################ +############################################ +## MAIN FUNCTION +############################################ +############################################ + +def igv_files_to_session(XMLOut,ListFile,Genome,PathPrefix=''): + + makedir(os.path.dirname(XMLOut)) + + fileList = [] + fin = open(ListFile,'r') + while True: + line = fin.readline() + if line: + ifile,colour = line.strip().split('\t') + if len(colour.strip()) == 0: + colour = '0,0,178' + fileList.append((PathPrefix.strip()+ifile,colour)) + else: + break + fout.close() + + ## ADD RESOURCES SECTION + XMLStr = '\n' + XMLStr += '\n' % (Genome) + XMLStr += '\t\n' + for ifile,colour in fileList: + XMLStr += '\t\t\n' % (ifile) + XMLStr += '\t\n' + + ## ADD PANEL SECTION + XMLStr += '\t\n' + for ifile,colour in fileList: + extension = os.path.splitext(ifile)[1].lower() + if extension in ['.bed','.broadpeak','.narrowpeak']: + XMLStr += '\t\t Date: Thu, 24 Jun 2021 01:17:40 +0200 Subject: [PATCH 05/40] igv session download from report --- workflow/report/igv_session.rst | 7 +++ workflow/rules/common.smk | 41 ++++++++++--- workflow/rules/consensus_peak_analysis.smk | 9 +-- workflow/rules/igv_session.smk | 69 +++++++++++++++++----- workflow/rules/peak_analysis.smk | 9 ++- workflow/rules/post-analysis.smk | 2 +- workflow/scripts/igv_report_cleanup.py | 11 +++- 7 files changed, 110 insertions(+), 38 deletions(-) create mode 100644 workflow/report/igv_session.rst diff --git a/workflow/report/igv_session.rst b/workflow/report/igv_session.rst new file mode 100644 index 0000000..d95a3ff --- /dev/null +++ b/workflow/report/igv_session.rst @@ -0,0 +1,7 @@ +The IGV session can be downloaded here. It contains all files necessary for the session. +Unzip the directory 'report_igv_session.zip'. +`Install IGV `_ and open session by selecting +'report_igv_session.xml'. + +Alternatively you can also start the IGV session directly from the results of the workflow. +The session file is located in 'results/IGV/igv_session.xml'. diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 3971c08..4c19821 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -161,8 +161,8 @@ def get_read_group(wildcards): unit=wildcards.unit, platform=units.loc[(wildcards.sample, wildcards.unit), "platform"]) -# def get_deseq2_igv_input(): -# return [file for file in os.listdir("results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks")] +def get_unique_antibodies(): + return set([antibody for antibody in samples["antibody"] if not pd.isnull(antibody)]) def get_igv_input(): igv_input = [] @@ -179,8 +179,8 @@ def get_igv_input(): peak=config["params"]["peak-analysis"] ) ) - unique_antibodies = set([antibody for antibody in samples["antibody"] if not pd.isnull(antibody)]) - for antibody in unique_antibodies: + + for antibody in get_unique_antibodies(): igv_input.extend( expand( [ @@ -193,6 +193,31 @@ def get_igv_input(): ) return igv_input +def get_files_for_igv(): + igv_files = [] + for sample in samples.index: + igv_files.extend( + expand( + [ + "results/big_wig/{sample}.bigWig" + ], + sample=sample + ) + ) + + igv_files.extend( + expand( + [ + "results/macs2_callpeak/{sample_control_peak}_peaks.{peak}Peak", + "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.bed" + ], + sample_control_peak=get_sample_control_peak_combinations_list(), + peak=config["params"]["peak-analysis"], + antibody=get_unique_antibodies() + ) + ) + return igv_files + def get_multiqc_input(wildcards): multiqc_input = [] for (sample, unit) in units.index: @@ -312,7 +337,8 @@ def all_input(wildcards): # QC with fastQC and multiQC, igv session wanted_input.extend([ "results/qc/multiqc/multiqc.html", - "results/IGV/igv_session.xml" + "results/IGV/igv_session.xml", + "results/IGV/report_igv_session.zip" ]) # trimming reads @@ -345,7 +371,6 @@ def all_input(wildcards): wanted_input.extend( expand ( [ - # "results/IGV/big_wig/merged_library.bigWig.igv.txt", "results/phantompeakqualtools/{sample}.phantompeak.pdf" ], sample = sample @@ -396,8 +421,7 @@ def all_input(wildcards): expand( [ "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.saf", - "results/macs2_merged_expand/plots/{antibody}.consensus_{peak}-peaks.boolean.intersect.plot.pdf", - # "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.txt" + "results/macs2_merged_expand/plots/{antibody}.consensus_{peak}-peaks.boolean.intersect.plot.pdf" ], peak = config["params"]["peak-analysis"], antibody = antibody @@ -443,7 +467,6 @@ def all_input(wildcards): "results/macs2_callpeak/{sample}-{control}.{peak}_treat_pileup.bdg", "results/macs2_callpeak/{sample}-{control}.{peak}_control_lambda.bdg", "results/macs2_callpeak/{sample}-{control}.{peak}_peaks.{peak}Peak", - # "results/IGV/macs2_callpeak-{peak}/merged_library.{sample}-{control}.{peak}_peaks.igv.txt", "results/macs2_callpeak/plots/plot_{peak}_peaks_count.pdf", "results/macs2_callpeak/plots/plot_{peak}_peaks_frip_score.pdf", "results/macs2_callpeak/plots/plot_{peak}_peaks_macs2.pdf" diff --git a/workflow/rules/consensus_peak_analysis.smk b/workflow/rules/consensus_peak_analysis.smk index ce34a5f..935e12a 100644 --- a/workflow/rules/consensus_peak_analysis.smk +++ b/workflow/rules/consensus_peak_analysis.smk @@ -47,7 +47,7 @@ rule create_consensus_bed: input: "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.txt" output: - report("results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.bed", category="accessory files") + "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.bed" conda: "../envs/gawk.yaml" log: @@ -94,8 +94,6 @@ rule create_consensus_igv: shell: "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.boolean.bed' -exec " "echo -e '{{}}\t0,0,0' \; > {output} 2> {log}" - # "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.boolean.bed' -exec " - # "echo -e 'results/IGV/consensus/{wildcards.antibody}/\"{{}}\"\t0,0,0' \; > {output} 2> {log}" rule homer_consensus_annotatepeaks: input: @@ -192,10 +190,7 @@ rule featurecounts_deseq2: FDR_1_perc_res=directory("results/deseq2/FDR/results/FDR_0.01_{antibody}.consensus_{peak}-peaks"), FDR_5_perc_res=directory("results/deseq2/FDR/results/FDR_0.05_{antibody}.consensus_{peak}-peaks"), FDR_1_perc_bed=directory("results/deseq2/FDR/bed_files/FDR_0.01_{antibody}.consensus_{peak}-peaks"), - FDR_5_perc_bed=report( - directory("results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks"), - patterns=["{antibody}.{{group_1_vs_group_2}}.FDR_0.05_results.bed"], - category="accessory files"), + FDR_5_perc_bed=directory("results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks"), igv_FDR_5_bed="results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.txt", plot_FDR_1_perc_MA=report( directory("results/deseq2/comparison_plots/MA_plots/FDR_0.01_{antibody}consensus_{peak}-peaks"), diff --git a/workflow/rules/igv_session.smk b/workflow/rules/igv_session.smk index 4da8932..1f7c0ac 100644 --- a/workflow/rules/igv_session.smk +++ b/workflow/rules/igv_session.smk @@ -2,34 +2,73 @@ rule create_igv_input_file: input: get_igv_input() output: - # "results/IGV/merged_igv_files.txt" - "results/IGV/igv_files.txt" + temp("results/IGV/igv_files.txt") log: "logs/igv/merged_igv_files.log" shell: "cat {input} > {output} 2> {log}" -# remove paths for igv session download from report.zip -# rule igv_report_cleanup: -# input: -# "results/IGV/merged_igv_files.txt" -# output: -# "results/IGV/igv_files.txt" -# log: -# "logs/igv/igv_files.log" -# script: -# " ../scripts/igv_report_cleanup.py" - +# igv session that can be started directly from the generated files of workflow rule igv_files_to_session: input: - # igv_data=get_igv_data, igv="results/IGV/igv_files.txt", fasta="resources/ref/genome.fasta" output: - report("results/IGV/igv_session.xml", category = "igv session") + "results/IGV/igv_session.xml" params: "--path_prefix '../../'" log: "logs/igv/igv_session_to_file.log" shell: " ../workflow/scripts/igv_files_to_session.py {output} {input.igv} ../../{input.fasta} {params} 2> {log}" + +# remove paths for igv session to download from report.zip +rule igv_report_cleanup: + input: + "results/IGV/igv_files.txt" + output: + temp("results/IGV/report_igv_files.txt") + log: + "logs/igv/igv_files.log" + script: + "../scripts/igv_report_cleanup.py" + +rule igv_files_to_report: + input: + igv="results/IGV/report_igv_files.txt", + fasta="resources/ref/genome.fasta" + output: + temp("results/IGV/report_igv_session.xml") + params: + "" + log: + "logs/igv/igv_files_to_report.log" + shell: + " ../workflow/scripts/igv_files_to_session.py {output} {input.igv} $(basename {input.fasta}) {params} 2> {log}" + +rule collect_igv_report_session_files: + input: + igv_data=get_files_for_igv(), + deseq2_files=directory(expand("results/deseq2/FDR/bed_files/FDR_0.05_{antibody}.consensus_{peak}-peaks", + antibody=get_unique_antibodies(), peak=config["params"]["peak-analysis"])), + fasta="resources/ref/genome.fasta", + igv_session="results/IGV/report_igv_session.xml" + output: + temp(directory("results/IGV/report_igv_session")) + log: + "logs/igv/collect_igv_report_session_files.log" + shell: + "mkdir -p {output}; cp {input.igv_data} {output}/; " + "cp -R {input.deseq2_files}/* {output}/; cp {input.fasta} {output}/; " + "cp {input.igv_session} {output}/" + +# igv session that can be downloaded from generated report +rule zip_igv_report_session: + input: + directory("results/IGV/report_igv_session") + output: + report("results/IGV/report_igv_session.zip", caption = "../report/igv_session.rst", category="IGV session") + log: + "logs/igv/collect_igv_report_session_files.log" + shell: + "cd $(dirname {input}); zip $(basename {output}) $(basename {input})/*" diff --git a/workflow/rules/peak_analysis.smk b/workflow/rules/peak_analysis.smk index fdb413e..1ebfb96 100644 --- a/workflow/rules/peak_analysis.smk +++ b/workflow/rules/peak_analysis.smk @@ -52,10 +52,9 @@ rule macs2_callpeak_broad: "_treat_pileup.bdg", "_control_lambda.bdg", # these output extensions internally set the --broad option: + "_peaks.broadPeak", "_peaks.gappedPeak" - ), - report("results/macs2_callpeak/{sample}-{control}.broad_peaks.broadPeak", category="accessory files") - + ) log: "logs/macs2/callpeak.{sample}-{control}.broad.log" params: @@ -82,9 +81,9 @@ rule macs2_callpeak_narrow: "_treat_pileup.bdg", "_control_lambda.bdg", # these output extensions internally set the --broad option: + "_peaks.narrowPeak", "_summits.bed" - ), - report("results/macs2_callpeak/{sample}-{control}.narrow_peaks.narrowPeak", category="accessory files") + ) log: "logs/macs2/callpeak.{sample}-{control}.narrow.log" params: diff --git a/workflow/rules/post-analysis.smk b/workflow/rules/post-analysis.smk index 79d346e..61fe5cd 100644 --- a/workflow/rules/post-analysis.smk +++ b/workflow/rules/post-analysis.smk @@ -102,7 +102,7 @@ rule bedGraphToBigWig: bedGraph="results/bed_graph/{sample}.sorted.bedgraph", chromsizes="resources/ref/genome.chrom.sizes" output: - report("results/big_wig/{sample}.bigWig", category="accessory files") + "results/big_wig/{sample}.bigWig" log: "logs/big_wig/{sample}.log" params: diff --git a/workflow/scripts/igv_report_cleanup.py b/workflow/scripts/igv_report_cleanup.py index 4b4d036..d9eb2dc 100644 --- a/workflow/scripts/igv_report_cleanup.py +++ b/workflow/scripts/igv_report_cleanup.py @@ -1,3 +1,12 @@ +from smart_open import open +import os + log = snakemake.log_fmt_shell(stdout=False, stderr=True) -merged_igv = snakemake.input[0] +with open(snakemake.input[0]) as fin: + with open(snakemake.output[0], 'w') as fout: + for line in fin: + items = line.split("\t") + items[0] = os.path.basename(items[0]) + line = "{path}\t{col}".format(path=items[0], col=items[1]) + fout.write(line) From ef5f66c8113c3213a6af20067ab27163dd815687 Mon Sep 17 00:00:00 2001 From: AntonieV Date: Thu, 24 Jun 2021 15:21:26 +0200 Subject: [PATCH 06/40] some minor changes on report descriptions --- workflow/report/multiqc_report.rst | 3 +++ workflow/report/plot_annotatepeaks_homer.rst | 5 ++--- .../report/plot_base_distribution_by_cycle_picard_mm.rst | 4 ++-- workflow/report/plot_deseq2_FDR_1_perc_MA.rst | 2 +- workflow/report/plot_deseq2_FDR_5_perc_MA.rst | 2 +- workflow/report/plot_deseq2_heatmap.rst | 5 ++--- workflow/report/plot_deseq2_pca.rst | 7 +++---- workflow/report/plot_deseq2_sample_corr_heatmap.rst | 5 ++--- workflow/report/plot_deseq2_scatter.rst | 3 +-- workflow/report/plot_fingerprint_deeptools.rst | 4 ++-- workflow/report/plot_heatmap_deeptools.rst | 2 +- workflow/report/plot_insert_size_histogram_picard_mm.rst | 4 ++-- workflow/report/plot_macs2_qc.rst | 2 +- workflow/report/plot_phantompeak_phantompeakqualtools.rst | 4 ++-- workflow/report/plot_profile_deeptools.rst | 2 +- workflow/report/plot_quality_by_cycle_picard_mm.rst | 4 ++-- workflow/report/plot_quality_distribution_picard_mm.rst | 4 ++-- workflow/rules/consensus_peak_analysis.smk | 8 ++++---- workflow/rules/qc.smk | 2 +- 19 files changed, 35 insertions(+), 37 deletions(-) create mode 100644 workflow/report/multiqc_report.rst diff --git a/workflow/report/multiqc_report.rst b/workflow/report/multiqc_report.rst new file mode 100644 index 0000000..5a14a9b --- /dev/null +++ b/workflow/report/multiqc_report.rst @@ -0,0 +1,3 @@ +**MultiQC** report is a collection of multiple plots and stats from Chip-seq processing pipeline (phantompeakqualtools), +Preseq analysis, metrics from Picard and Samtools. There are also quality control metrics and plots from FastQC analysis. +Detailed descriptions of the individual plots and statistics can be found in MultiQC report. diff --git a/workflow/report/plot_annotatepeaks_homer.rst b/workflow/report/plot_annotatepeaks_homer.rst index e835d23..1ed0e62 100644 --- a/workflow/report/plot_annotatepeaks_homer.rst +++ b/workflow/report/plot_annotatepeaks_homer.rst @@ -1,4 +1,3 @@ -**`Homer annotatePeaks `_ ** is used to annotate the peaks relative +`Homer annotatePeaks `_ is used to annotate the peaks relative to known genomic features. The plots of this analysis show for all samples and their associated controls peak location -relative to annotation, percentage of unique genes to closest peak an peak distribution relative to TSS -(Transcription Start Site). +relative to annotation, percentage of unique genes to closest peak an peak distribution relative to TSS (Transcription Start Site). diff --git a/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst b/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst index 3631d8f..016bd55 100644 --- a/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst +++ b/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst @@ -1,5 +1,5 @@ -**`Base distribution by cycle plot -`_ (Picard)** is +`Base distribution by cycle plot +`_ **(Picard)** is used as quality control for alignment-level and shows the nucleotide distribution per cycle of the bam files after filtering, sorting, merging and removing orphans. For any cycle within reads the relative proportions of nucleotides should reflect the AT:CG content. For all nucleotides flattish lines would be expected and any spikes would suggest a diff --git a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst index 2c83f01..6f3b891 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst @@ -1,4 +1,4 @@ -**`MA plot `_ (FDR 0.01)** +`MA plot `_ **(FDR 0.01)** shows the log2 fold changes versus the mean of normalized counts from `DESeq2 `_ analysis filtered on a false discovery rate (FDR) threshold of 0.01 for pairwise comparisons of samples across the groups from a particular antibody. diff --git a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst index 06aee7f..dca55a6 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst @@ -1,4 +1,4 @@ -**`MA plot `_ (FDR 0.05)** +`MA plot `_ **(FDR 0.05)** shows the log2 fold changes versus the mean of normalized counts from `DESeq2 `_ analysis filtered on a false discovery rate (FDR) threshold of 0.05 for pairwise comparisons of samples across the groups from a particular antibody. diff --git a/workflow/report/plot_deseq2_heatmap.rst b/workflow/report/plot_deseq2_heatmap.rst index 1d3397c..599c234 100644 --- a/workflow/report/plot_deseq2_heatmap.rst +++ b/workflow/report/plot_deseq2_heatmap.rst @@ -1,7 +1,6 @@ -**`Heatmap plot `_** shows for each antibody a heatmap of +`Heatmap plot `_ shows for each antibody a heatmap of the `rlog `_ or `vst `_ transformed counts from -`DESeq2 `_ analysis. For more -information about DESeq2 please see +DESeq2 analysis. For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_pca.rst b/workflow/report/plot_deseq2_pca.rst index 581af78..db724a3 100644 --- a/workflow/report/plot_deseq2_pca.rst +++ b/workflow/report/plot_deseq2_pca.rst @@ -1,7 +1,6 @@ -**`PCA plot `_ -(Principal Component Analysis)** describes variance of the +`PCA plot `_ +**(Principal Component Analysis)** describes variance of the `rlog `_ or `vst `_ transformed counts from -`DESeq2 `_ analysis with regard to -the antibody used. For more information about DESeq2 please see +DESeq2 analysis with regard to the antibody used. For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_sample_corr_heatmap.rst b/workflow/report/plot_deseq2_sample_corr_heatmap.rst index e23f22c..7f91af9 100644 --- a/workflow/report/plot_deseq2_sample_corr_heatmap.rst +++ b/workflow/report/plot_deseq2_sample_corr_heatmap.rst @@ -1,6 +1,5 @@ -**Correlation `heatmap plots `_** shows heatmaps of the +Correlation `heatmap plots `_ shows heatmaps of the `rlog `_ or `vst `_ transformed counts from -`DESeq2 `_ analysis for each -pairwise comparison of samples across the groups from a particular antibody. For more information about DESeq2 please +DESeq2 analysis for each pairwise comparison of samples across the groups from a particular antibody. For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_scatter.rst b/workflow/report/plot_deseq2_scatter.rst index 4071ea8..6a8cc6f 100644 --- a/workflow/report/plot_deseq2_scatter.rst +++ b/workflow/report/plot_deseq2_scatter.rst @@ -1,6 +1,5 @@ **Scatter plots** uses the matrix of the `rlog `_ or `vst `_ transformed counts from -`DESeq2 `_ analysis for -pairwise comparisons of samples across the groups from a particular antibody. For more information about DESeq2 please see +DESeq2 analysis for pairwise comparisons of samples across the groups from a particular antibody. For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_fingerprint_deeptools.rst b/workflow/report/plot_fingerprint_deeptools.rst index 9874c98..fecf21e 100644 --- a/workflow/report/plot_fingerprint_deeptools.rst +++ b/workflow/report/plot_fingerprint_deeptools.rst @@ -1,9 +1,9 @@ -**deepTools `plotFingerprint `_** is a +**deepTools** `plotFingerprint `_ is a quality control tool, which determines how well the signal in the ChIP-seq sample can be differentiated from the background distribution of reads in the control sample. plotFingerprint randomly samples genome regions of a specified length and sums the per-base coverage that overlap with those regions. These values are then sorted according to their rank and the cumulative sum of read counts is plotted. With a perfect uniform distribution of reads along the genome and infinite sequencing coverage a straight diagonal line is shown in the plot. For more information on the interpretation of -the plots please see `here `_. +the plots please see `documentation `_. For more information about plotFingerprint metrics see `here `_. diff --git a/workflow/report/plot_heatmap_deeptools.rst b/workflow/report/plot_heatmap_deeptools.rst index aa76279..68b871e 100644 --- a/workflow/report/plot_heatmap_deeptools.rst +++ b/workflow/report/plot_heatmap_deeptools.rst @@ -1,4 +1,4 @@ -**deepTools `plotHeatmap `_** creates a +**deepTools** `plotHeatmap `_ creates a heatmap for scores over sets of genomic regions and gives a visualisation for the genome-wide enrichment of the samples. The `scores `_ represent the signal over a set of regions where all regions are scaled to the same size. diff --git a/workflow/report/plot_insert_size_histogram_picard_mm.rst b/workflow/report/plot_insert_size_histogram_picard_mm.rst index 348a398..05edb0d 100644 --- a/workflow/report/plot_insert_size_histogram_picard_mm.rst +++ b/workflow/report/plot_insert_size_histogram_picard_mm.rst @@ -1,5 +1,5 @@ -**`Insert size histogram -`_ (Picard)** is used +`Insert size histogram +`_ **(Picard)** is used for validating paired-end library construction. It shows the insert size distribution versus fractions of read pairs in each of the three orientations (FR, RF, and TANDEM) as a histogram. For more information about `collected Picard metrics `_ please diff --git a/workflow/report/plot_macs2_qc.rst b/workflow/report/plot_macs2_qc.rst index 9c2ba8c..bb8a330 100644 --- a/workflow/report/plot_macs2_qc.rst +++ b/workflow/report/plot_macs2_qc.rst @@ -1,4 +1,4 @@ -**`MACS2 `_ callpeak quality control plots** show for all +`MACS2 `_ **callpeak quality control plots** show for all samples and their associated controls the number of peaks and their distributions of peak length, fold-change, FDR and p-value from the results of the `MACS callpeak analysis `_. diff --git a/workflow/report/plot_phantompeak_phantompeakqualtools.rst b/workflow/report/plot_phantompeak_phantompeakqualtools.rst index 232ef68..85da65e 100644 --- a/workflow/report/plot_phantompeak_phantompeakqualtools.rst +++ b/workflow/report/plot_phantompeak_phantompeakqualtools.rst @@ -1,10 +1,10 @@ - **`Phantompeakqualtools plot `_** shows strand-shift versus +`Phantompeakqualtools plot `_ shows strand-shift versus cross-correlation and computes informative enrichment and quality measures for ChIP-seq data. It also calculates the relative (RSC) and the normalized strand cross-correlation coefficient (NSC). Datasets with NSC values much less than 1.1 tend to have low signal to noise or few peaks. This may indicate poor quality or only few binding sites. RSC values significantly lower than 1 (< 0.8) tend to have low signal to noise. The low scores can be due to several factors and are often due to failed and poor quality ChIP or low read sequence quality. In section "ChIP-seq processing pipeline" of MultiQC report are additional plots across all samples for RSC, NSC and strand-shift cross-correlation. - For more information about Phantompeakqualtools please +For more information about Phantompeakqualtools please see `documentation `_. For more information about interpretation see published `article `_. diff --git a/workflow/report/plot_profile_deeptools.rst b/workflow/report/plot_profile_deeptools.rst index 5cd87b3..76ccd02 100644 --- a/workflow/report/plot_profile_deeptools.rst +++ b/workflow/report/plot_profile_deeptools.rst @@ -1,4 +1,4 @@ -**deepTools `plotProfile `_** creates a +**deepTools** `plotProfile `_ creates a profile plot for scores over sets of genomic regions and gives a visualisation for the genome-wide enrichment of the samples. The `scores `_ represent the signal over a set of regions where all regions are scaled to the same size. diff --git a/workflow/report/plot_quality_by_cycle_picard_mm.rst b/workflow/report/plot_quality_by_cycle_picard_mm.rst index 5219fe1..b8b8a61 100644 --- a/workflow/report/plot_quality_by_cycle_picard_mm.rst +++ b/workflow/report/plot_quality_by_cycle_picard_mm.rst @@ -1,5 +1,5 @@ -**`Mean quality by cycle plot -`_ (Picard)** is used as +`Mean quality by cycle plot +`_ **(Picard)** is used as quality control for sequencing machine performance and collects the mean quality by cycle of the bam files after filtering, sorting, merging and removing orphans. Any spikes in quality within reads may indicate technical problems during sequencing. For more information about `collected Picard metrics diff --git a/workflow/report/plot_quality_distribution_picard_mm.rst b/workflow/report/plot_quality_distribution_picard_mm.rst index 4793171..bd181ef 100644 --- a/workflow/report/plot_quality_distribution_picard_mm.rst +++ b/workflow/report/plot_quality_distribution_picard_mm.rst @@ -1,5 +1,5 @@ -**`Quality score distribution plot -`_ (Picard)** is used +`Quality score distribution plot +`_ **(Picard)** is used as overall quality control for a library in a given run. It shows for the range of quality scores the corresponding total numbers of bases. For more information about `collected Picard metrics `_ please diff --git a/workflow/rules/consensus_peak_analysis.smk b/workflow/rules/consensus_peak_analysis.smk index 935e12a..6a7057f 100644 --- a/workflow/rules/consensus_peak_analysis.smk +++ b/workflow/rules/consensus_peak_analysis.smk @@ -196,22 +196,22 @@ rule featurecounts_deseq2: directory("results/deseq2/comparison_plots/MA_plots/FDR_0.01_{antibody}consensus_{peak}-peaks"), patterns=["{antibody}.{{group_1_vs_group_2}}.MA-plot_FDR_0.01.pdf"], caption = "../report/plot_deseq2_FDR_1_perc_MA.rst", - category = "DESeq2-FDR"), + category = "DESeq2"), plot_FDR_5_perc_MA=report( directory("results/deseq2/comparison_plots/MA_plots/FDR_0.05_{antibody}consensus_{peak}-peaks"), patterns=["{antibody}.{{group_1_vs_group_2}.MA-plot_FDR_0.05.pdf"], caption = "../report/plot_deseq2_FDR_5_perc_MA.rst", - category = "DESeq2-FDR"), + category = "DESeq2"), plot_FDR_1_perc_volcano=report( directory("results/deseq2/comparison_plots/volcano_plots/FDR_0.01_{antibody}consensus_{peak}-peaks"), patterns=["{antibody}.{{group_1_vs_group_2}}.volcano-plot_FDR_0.01.pdf"], caption = "../report/plot_deseq2_FDR_1_perc_volcano.rst", - category = "DESeq2-FDR"), + category = "DESeq2"), plot_FDR_5_perc_volcano=report( directory("results/deseq2/comparison_plots/volcano_plots/FDR_0.05_{antibody}consensus_{peak}-peaks"), patterns=["{antibody}.{{group_1_vs_group_2}}.volcano-plot_FDR_0.05.pdf"], caption = "../report/plot_deseq2_FDR_5_perc_volcano.rst", - category = "DESeq2-FDR"), + category = "DESeq2"), plot_sample_corr_heatmap=report( directory("results/deseq2/comparison_plots/correlation_heatmaps_{antibody}consensus_{peak}-peaks"), patterns=["{antibody}.{{group_1_vs_group_2}}.correlation_heatmap.pdf"], diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index dedc65b..5314466 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -16,7 +16,7 @@ rule multiqc: input: get_multiqc_input output: - report("results/qc/multiqc/multiqc.html", category="MultiQC report") + report("results/qc/multiqc/multiqc.html", caption="../report/multiqc_report.rst", category="QC") log: "logs/multiqc.log" wrapper: From 6cf223cd6b4aef481edb487531a61d57832482e5 Mon Sep 17 00:00:00 2001 From: AntonieV Date: Thu, 24 Jun 2021 18:30:57 +0200 Subject: [PATCH 07/40] env for zip added and some other minor changes --- workflow/envs/gzip.yaml | 5 +++++ workflow/rules/igv_session.smk | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 workflow/envs/gzip.yaml diff --git a/workflow/envs/gzip.yaml b/workflow/envs/gzip.yaml new file mode 100644 index 0000000..fafb424 --- /dev/null +++ b/workflow/envs/gzip.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - defaults +dependencies: + - gzip ==1.10 diff --git a/workflow/rules/igv_session.smk b/workflow/rules/igv_session.smk index 1f7c0ac..af42fd4 100644 --- a/workflow/rules/igv_session.smk +++ b/workflow/rules/igv_session.smk @@ -65,10 +65,12 @@ rule collect_igv_report_session_files: # igv session that can be downloaded from generated report rule zip_igv_report_session: input: - directory("results/IGV/report_igv_session") + rules.collect_igv_report_session_files.output output: report("results/IGV/report_igv_session.zip", caption = "../report/igv_session.rst", category="IGV session") log: "logs/igv/collect_igv_report_session_files.log" + conda: + "../envs/gzip.yaml" shell: "cd $(dirname {input}); zip $(basename {output}) $(basename {input})/*" From 90b21c3009c62a137a5d5bac0cfa85961015be38 Mon Sep 17 00:00:00 2001 From: AntonieV Date: Thu, 24 Jun 2021 18:45:09 +0200 Subject: [PATCH 08/40] changes on zip env --- workflow/envs/{gzip.yaml => zip.yaml} | 2 +- workflow/rules/igv_session.smk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename workflow/envs/{gzip.yaml => zip.yaml} (76%) diff --git a/workflow/envs/gzip.yaml b/workflow/envs/zip.yaml similarity index 76% rename from workflow/envs/gzip.yaml rename to workflow/envs/zip.yaml index fafb424..9baef16 100644 --- a/workflow/envs/gzip.yaml +++ b/workflow/envs/zip.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - defaults dependencies: - - gzip ==1.10 + - zip ==3.0 diff --git a/workflow/rules/igv_session.smk b/workflow/rules/igv_session.smk index af42fd4..6670ac2 100644 --- a/workflow/rules/igv_session.smk +++ b/workflow/rules/igv_session.smk @@ -71,6 +71,6 @@ rule zip_igv_report_session: log: "logs/igv/collect_igv_report_session_files.log" conda: - "../envs/gzip.yaml" + "../envs/zip.yaml" shell: "cd $(dirname {input}); zip $(basename {output}) $(basename {input})/*" From bd56726ec726314969813371a8079ffb17f60226 Mon Sep 17 00:00:00 2001 From: AntonieV Date: Wed, 30 Jun 2021 00:13:40 +0200 Subject: [PATCH 09/40] Changes according to PR#8 --- .test/config/config.yaml | 8 +- .test/config/samples.tsv | 53 ++++++------ .test/config/units.tsv | 82 +++++++++---------- .test/config_paired_end/config.yaml | 8 +- .test/config_paired_end/units.tsv | 14 ++-- .test/config_paired_end_reduced/config.yaml | 8 +- .test/config_paired_end_reduced/units.tsv | 12 +-- .test/config_single_end/config.yaml | 8 +- .test/config_single_end/units.tsv | 14 ++-- .test/config_single_end_reduced/config.yaml | 8 +- .test/config_single_end_reduced/units.tsv | 14 ++-- config/README.md | 2 +- config/config.yaml | 10 +-- config/units.tsv | 2 +- workflow/report/multiqc_report.rst | 5 +- workflow/report/plot_annotatepeaks_homer.rst | 5 +- ...t_base_distribution_by_cycle_picard_mm.rst | 14 ++-- .../report/plot_consensus_peak_intersect.rst | 2 +- workflow/report/plot_deseq2_FDR_1_perc_MA.rst | 11 +-- .../report/plot_deseq2_FDR_1_perc_volcano.rst | 10 ++- workflow/report/plot_deseq2_FDR_5_perc_MA.rst | 12 +-- .../report/plot_deseq2_FDR_5_perc_volcano.rst | 9 +- workflow/report/plot_deseq2_heatmap.rst | 6 +- workflow/report/plot_deseq2_pca.rst | 2 +- .../plot_deseq2_sample_corr_heatmap.rst | 4 +- workflow/report/plot_deseq2_scatter.rst | 7 +- workflow/report/plot_heatmap_deeptools.rst | 4 +- workflow/report/plot_macs2_qc.rst | 5 +- workflow/report/plot_profile_deeptools.rst | 5 +- .../plot_quality_by_cycle_picard_mm.rst | 2 +- workflow/rules/consensus_peak_analysis.smk | 15 ++-- workflow/rules/peak_analysis.smk | 2 +- workflow/rules/qc.smk | 2 +- workflow/scripts/featurecounts_deseq2.R | 22 ++--- 34 files changed, 195 insertions(+), 192 deletions(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 14c186c..b521887 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -1,4 +1,4 @@ -# This file should contain everything to configure the workflow on a global scale. +# This file should contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config/samples.tsv" @@ -20,7 +20,7 @@ resources: release: 101 # Genome build build: GRCh38 - # for testing data a specific chromosome can be selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 @@ -53,7 +53,7 @@ params: activate: True consensus-peak-analysis: activate: True - # samtools view parameters: + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): # if duplicates should be removed in this filtering, add "-F 0x0400" to the params # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), @@ -63,7 +63,7 @@ params: samtools-view-se: "-b -F 0x004" samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" plotfingerprint: - # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments number-of-samples: 500000 # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv index 3bb0b81..d01ea7d 100644 --- a/.test/config/samples.tsv +++ b/.test/config/samples.tsv @@ -6,36 +6,35 @@ D E2 batch2 AJ ERa E TNFa batch1 AK ERa F TNFa batch2 AL ERa G E2_TNFa batch1 AM ERa -H E2_TNFa batch2 AN ERa -I Veh batch1 AG p65 -J Veh batch2 AH p65 -K E2 batch1 AI p65 -L E2 batch2 AJ p65 -M TNFa batch1 AK p65 -N TNFa batch2 AL p65 -O E2_TNFa batch1 AM p65 -P E2_TNFa batch2 AN p65 -Q Veh batch1 AG FoxA1 -R Veh batch2 AH FoxA1 -S E2 batch1 AI FoxA1 -T E2 batch2 AJ FoxA1 -U TNFa batch1 AK FoxA1 -V TNFa batch2 AL FoxA1 -W E2_TNFa batch1 AM FoxA1 -X E2_TNFa batch2 AM FoxA1 +H Veh batch1 AG p65 +I Veh batch2 AH p65 +J E2 batch1 AI p65 +K E2 batch2 AJ p65 +L TNFa batch1 AK p65 +M TNFa batch2 AL p65 +N E2_TNFa batch1 AM p65 +O E2_TNFa batch2 AN p65 +P Veh batch1 AG FoxA1 +Q Veh batch2 AH FoxA1 +R E2 batch1 AI FoxA1 +S E2 batch2 AJ FoxA1 +T TNFa batch1 AK FoxA1 +U TNFa batch2 AL FoxA1 +V E2_TNFa batch1 AM FoxA1 +W E2_TNFa batch2 AM FoxA1 +X E2_TNFa batch1 AM ERa Y E2_TNFa batch1 AM ERa Z E2_TNFa batch1 AM ERa AA E2_TNFa batch1 AM ERa -AB E2_TNFa batch1 AM ERa +AB E2_TNFa batch2 AN ERa AC E2_TNFa batch2 AN ERa AD E2_TNFa batch2 AN ERa AE E2_TNFa batch2 AN ERa -AF E2_TNFa batch2 AN ERa -AG Veh batch1 -AH Veh batch2 -AI E2 batch1 -AJ E2 batch2 -AK TNFa batch1 -AL TNFa batch2 -AM E2_TNFa batch1 -AN E2_TNFa batch2 +AF Veh batch1 +AG Veh batch2 +AH E2 batch1 +AI E2 batch2 +AJ TNFa batch1 +AK TNFa batch2 +AL E2_TNFa batch1 +AM E2_TNFa batch2 diff --git a/.test/config/units.tsv b/.test/config/units.tsv index d106e5e..ede8e79 100644 --- a/.test/config/units.tsv +++ b/.test/config/units.tsv @@ -1,41 +1,41 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 SRR1635443 ILLUMINA -B 1 SRR1635444 ILLUMINA -C 1 300 14 SRR1635445 ILLUMINA -D 1 SRR1635446 ILLUMINA -E 1 SRR1635447 ILLUMINA -F 1 SRR1635448 ILLUMINA -G 1 SRR1635449 ILLUMINA -H 1 SRR1635450 ILLUMINA -I 1 SRR1635451 ILLUMINA -J 1 SRR1635452 ILLUMINA -K 1 SRR1635453 ILLUMINA -L 1 SRR1635454 ILLUMINA -M 1 SRR1635455 ILLUMINA -N 1 SRR1635456 ILLUMINA -O 1 SRR1635457 ILLUMINA -P 1 SRR1635458 ILLUMINA -Q 1 SRR1635459 ILLUMINA -R 1 SRR1635460 ILLUMINA -S 1 SRR1635461 ILLUMINA -T 1 SRR1635462 ILLUMINA -U 1 SRR1635463 ILLUMINA -V 1 SRR1635464 ILLUMINA -W 1 SRR1635465 ILLUMINA -X 1 SRR1635466 ILLUMINA -Y 1 SRR1635467 ILLUMINA -Z 1 SRR1635468 ILLUMINA -AA 1 SRR1635469 ILLUMINA -AB 1 SRR1635470 ILLUMINA -AC 1 SRR1635471 ILLUMINA -AD 1 SRR1635472 ILLUMINA -AE 1 SRR1635473 ILLUMINA -AF 1 SRR1635474 ILLUMINA -AG 1 SRR1635435 ILLUMINA -AH 1 SRR1635436 ILLUMINA -AI 1 SRR1635437 ILLUMINA -AJ 1 SRR1635438 ILLUMINA -AK 1 SRR1635439 ILLUMINA -AL 1 SRR1635440 ILLUMINA -AM 1 SRR1635441 ILLUMINA -AN 1 SRR1635442 ILLUMINA +sample unit fq1 fq2 sra_accession platform +A 1 SRR1635443 ILLUMINA +B 1 SRR1635444 ILLUMINA +C 1 SRR1635445 ILLUMINA +D 1 SRR1635446 ILLUMINA +E 1 SRR1635447 ILLUMINA +F 1 SRR1635448 ILLUMINA +G 1 SRR1635449 ILLUMINA +G 2 SRR1635450 ILLUMINA +H 1 SRR1635451 ILLUMINA +I 1 SRR1635452 ILLUMINA +J 1 SRR1635453 ILLUMINA +K 1 SRR1635454 ILLUMINA +L 1 SRR1635455 ILLUMINA +M 1 SRR1635456 ILLUMINA +N 1 SRR1635457 ILLUMINA +O 1 SRR1635458 ILLUMINA +P 1 SRR1635459 ILLUMINA +Q 1 SRR1635460 ILLUMINA +R 1 SRR1635461 ILLUMINA +S 1 SRR1635462 ILLUMINA +T 1 SRR1635463 ILLUMINA +U 1 SRR1635464 ILLUMINA +V 1 SRR1635465 ILLUMINA +W 1 SRR1635466 ILLUMINA +X 1 SRR1635467 ILLUMINA +Y 1 SRR1635468 ILLUMINA +Z 1 SRR1635469 ILLUMINA +AA 1 SRR1635470 ILLUMINA +AB 1 SRR1635471 ILLUMINA +AC 1 SRR1635472 ILLUMINA +AD 1 SRR1635473 ILLUMINA +AE 1 SRR1635474 ILLUMINA +AF 1 SRR1635435 ILLUMINA +AG 1 SRR1635436 ILLUMINA +AH 1 SRR1635437 ILLUMINA +AI 1 SRR1635438 ILLUMINA +AJ 1 SRR1635439 ILLUMINA +AK 1 SRR1635440 ILLUMINA +AL 1 SRR1635441 ILLUMINA +AM 1 SRR1635442 ILLUMINA diff --git a/.test/config_paired_end/config.yaml b/.test/config_paired_end/config.yaml index 1ef8de3..acfecff 100644 --- a/.test/config_paired_end/config.yaml +++ b/.test/config_paired_end/config.yaml @@ -1,4 +1,4 @@ -# This file should contain everything to configure the workflow on a global scale. +# This file should contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config_paired_end_reduced/samples.tsv" @@ -17,7 +17,7 @@ resources: release: 101 # Genome build build: R64-1-1 - # for testing data a single chromosome can be selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 @@ -50,7 +50,7 @@ params: activate: True consensus-peak-analysis: activate: True - # samtools view parameters: + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): # if duplicates should be removed in this filtering, add "-F 0x0400" to the params # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), @@ -60,7 +60,7 @@ params: samtools-view-se: "-b -F 0x004" samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" plotfingerprint: - # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments number-of-samples: 500000 # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- diff --git a/.test/config_paired_end/units.tsv b/.test/config_paired_end/units.tsv index 2b3de2d..f36af94 100644 --- a/.test/config_paired_end/units.tsv +++ b/.test/config_paired_end/units.tsv @@ -1,7 +1,7 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 data/atacseq/test-datasets/testdata/SRR1822153_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822153_2.fastq.gz ILLUMINA -B 1 data/atacseq/test-datasets/testdata/SRR1822154_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822154_2.fastq.gz ILLUMINA -C 1 300 14 data/atacseq/test-datasets/testdata/SRR1822157_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822157_2.fastq.gz ILLUMINA -D 1 data/atacseq/test-datasets/testdata/SRR1822158_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822158_2.fastq.gz ILLUMINA -E 1 data/chipseq/test-datasets/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz data/chipseq/test-datasets/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz ILLUMINA -F 1 data/chipseq/test-datasets/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz data/chipseq/test-datasets/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R2.fastq.gz ILLUMINA +sample unit fq1 fq2 sra_accession platform +A 1 data/atacseq/test-datasets/testdata/SRR1822153_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822153_2.fastq.gz ILLUMINA +B 1 data/atacseq/test-datasets/testdata/SRR1822154_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822154_2.fastq.gz ILLUMINA +C 1 data/atacseq/test-datasets/testdata/SRR1822157_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822157_2.fastq.gz ILLUMINA +D 1 data/atacseq/test-datasets/testdata/SRR1822158_1.fastq.gz data/atacseq/test-datasets/testdata/SRR1822158_2.fastq.gz ILLUMINA +E 1 data/chipseq/test-datasets/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R1.fastq.gz data/chipseq/test-datasets/testdata/SRR5204809_Spt5-ChIP_Input1_SacCer_ChIP-Seq_ss100k_R2.fastq.gz ILLUMINA +F 1 data/chipseq/test-datasets/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R1.fastq.gz data/chipseq/test-datasets/testdata/SRR5204810_Spt5-ChIP_Input2_SacCer_ChIP-Seq_ss100k_R2.fastq.gz ILLUMINA diff --git a/.test/config_paired_end_reduced/config.yaml b/.test/config_paired_end_reduced/config.yaml index bfcc458..1e6f08f 100644 --- a/.test/config_paired_end_reduced/config.yaml +++ b/.test/config_paired_end_reduced/config.yaml @@ -1,4 +1,4 @@ -# This file should contain everything to configure the workflow on a global scale. +# This file should contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config_paired_end_reduced/samples.tsv" @@ -17,7 +17,7 @@ resources: release: 101 # Genome build build: R64-1-1 - # for testing data a specific chromosome can be selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: VII # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 @@ -50,7 +50,7 @@ params: activate: True consensus-peak-analysis: activate: True - # samtools view parameters: + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): # if duplicates should be removed in this filtering, add "-F 0x0400" to the params # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), @@ -60,7 +60,7 @@ params: samtools-view-se: "-b -F 0x004" samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" plotfingerprint: - # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments number-of-samples: 500000 # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- diff --git a/.test/config_paired_end_reduced/units.tsv b/.test/config_paired_end_reduced/units.tsv index 53324be..87687ea 100644 --- a/.test/config_paired_end_reduced/units.tsv +++ b/.test/config_paired_end_reduced/units.tsv @@ -1,6 +1,6 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 data/paired_end_test_data/A-1_vii_1.fastq.gz data/paired_end_test_data/A-1_vii_2.fastq.gz ILLUMINA -B 1 data/paired_end_test_data/B-1_vii_1.fastq.gz data/paired_end_test_data/B-1_vii_2.fastq.gz ILLUMINA -C 1 300 14 data/paired_end_test_data/C-1_vii_1.fastq.gz data/paired_end_test_data/C-1_vii_2.fastq.gz ILLUMINA -D 1 data/paired_end_test_data/D-1_vii_1.fastq.gz data/paired_end_test_data/D-1_vii_2.fastq.gz ILLUMINA -E 1 data/paired_end_test_data/E-1_vii_1.fastq.gz data/paired_end_test_data/E-1_vii_2.fastq.gz ILLUMINA +sample unit fq1 fq2 sra_accession platform +A 1 data/paired_end_test_data/A-1_vii_1.fastq.gz data/paired_end_test_data/A-1_vii_2.fastq.gz ILLUMINA +B 1 data/paired_end_test_data/B-1_vii_1.fastq.gz data/paired_end_test_data/B-1_vii_2.fastq.gz ILLUMINA +C 1 data/paired_end_test_data/C-1_vii_1.fastq.gz data/paired_end_test_data/C-1_vii_2.fastq.gz ILLUMINA +D 1 data/paired_end_test_data/D-1_vii_1.fastq.gz data/paired_end_test_data/D-1_vii_2.fastq.gz ILLUMINA +E 1 data/paired_end_test_data/E-1_vii_1.fastq.gz data/paired_end_test_data/E-1_vii_2.fastq.gz ILLUMINA diff --git a/.test/config_single_end/config.yaml b/.test/config_single_end/config.yaml index b239756..c376453 100644 --- a/.test/config_single_end/config.yaml +++ b/.test/config_single_end/config.yaml @@ -1,4 +1,4 @@ -# This file should contain everything to configure the workflow on a global scale. +# This file should contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config_single_end/samples.tsv" @@ -20,7 +20,7 @@ resources: release: 101 # Genome build build: GRCh38 - # for testing data a specific chromosome can be selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 @@ -53,7 +53,7 @@ params: activate: True consensus-peak-analysis: activate: True - # samtools view parameters: + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): # if duplicates should be removed in this filtering, add "-F 0x0400" to the params # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), @@ -63,7 +63,7 @@ params: samtools-view-se: "-b -F 0x004" samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" plotfingerprint: - # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments number-of-samples: 500000 # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- diff --git a/.test/config_single_end/units.tsv b/.test/config_single_end/units.tsv index bbd8e31..f18818d 100644 --- a/.test/config_single_end/units.tsv +++ b/.test/config_single_end/units.tsv @@ -1,7 +1,7 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 SRR1635455 ILLUMINA -B 1 SRR1635456 ILLUMINA -C 1 SRR1635457 ILLUMINA -C 2 SRR1635458 ILLUMINA -D 1 SRR1635439 ILLUMINA -E 1 SRR1635441 ILLUMINA +sample unit fq1 fq2 sra_accession platform +A 1 SRR1635455 ILLUMINA +B 1 SRR1635456 ILLUMINA +C 1 SRR1635457 ILLUMINA +C 2 SRR1635458 ILLUMINA +D 1 SRR1635439 ILLUMINA +E 1 SRR1635441 ILLUMINA diff --git a/.test/config_single_end_reduced/config.yaml b/.test/config_single_end_reduced/config.yaml index 1870c13..c7420b2 100644 --- a/.test/config_single_end_reduced/config.yaml +++ b/.test/config_single_end_reduced/config.yaml @@ -1,4 +1,4 @@ -# This file should contain everything to configure the workflow on a global scale. +# This file should contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config_single_end_reduced/samples.tsv" @@ -20,7 +20,7 @@ resources: release: 101 # Genome build build: GRCh38 - # for testing data a specific chromosome can be selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: 21 # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 @@ -53,7 +53,7 @@ params: activate: True consensus-peak-analysis: activate: True - # samtools view parameters: + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): # if duplicates should be removed in this filtering, add "-F 0x0400" to the params # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), @@ -63,7 +63,7 @@ params: samtools-view-se: "-b -F 0x004" samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" plotfingerprint: - # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments number-of-samples: 500000 # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- diff --git a/.test/config_single_end_reduced/units.tsv b/.test/config_single_end_reduced/units.tsv index a652722..0ddd560 100644 --- a/.test/config_single_end_reduced/units.tsv +++ b/.test/config_single_end_reduced/units.tsv @@ -1,7 +1,7 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform -A 1 data/single_end_test_data/A-1_chr21.fastq.gz ILLUMINA -B 1 data/single_end_test_data/B-1_chr21.fastq.gz ILLUMINA -C 1 data/single_end_test_data/C-1_chr21.fastq.gz ILLUMINA -C 2 data/single_end_test_data/C-2_chr21.fastq.gz ILLUMINA -D 1 data/single_end_test_data/D-1_chr21.fastq.gz ILLUMINA -E 1 data/single_end_test_data/E-1_chr21.fastq.gz ILLUMINA +sample unit fq1 fq2 sra_accession platform +A 1 data/single_end_test_data/A-1_chr21.fastq.gz ILLUMINA +B 1 data/single_end_test_data/B-1_chr21.fastq.gz ILLUMINA +C 1 data/single_end_test_data/C-1_chr21.fastq.gz ILLUMINA +C 2 data/single_end_test_data/C-2_chr21.fastq.gz ILLUMINA +D 1 data/single_end_test_data/D-1_chr21.fastq.gz ILLUMINA +E 1 data/single_end_test_data/E-1_chr21.fastq.gz ILLUMINA diff --git a/config/README.md b/config/README.md index 2f5541b..ed07b51 100644 --- a/config/README.md +++ b/config/README.md @@ -5,7 +5,7 @@ To configure this workflow, modify ``config/config.yaml`` according to your need # Sample sheet Add samples to `config/samples.tsv`. For each sample, the columns `sample`, `group`, `control`, and `antibody` have to be defined. -* Samples / IP (immunoprecipitations) within the same `group` represents replicates and must have the same antibody and the same control. +* Samples / IP (immunoprecipitations) within the same `group` represent replicates and must have the same antibody and the same control. * Controls / Input are listed like samples, but they do not have entries in the columns for `control` and `antibody`. * The identifiers of each control has to be noted in the column `sample`. * For all samples, the identifiers of the corresponding controls have to be given in the `control` column (see example below). diff --git a/config/config.yaml b/config/config.yaml index 1fe58de..8e840fb 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,8 +1,8 @@ -# This file contain everything to configure the workflow on a global scale. +# This file contains everything to configure the workflow on a global scale. # The sample based data must be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config/samples.tsv" -# to download reads from SRA the accession numbers (see https://www.ncbi.nlm.nih.gov/sra) of samples must be given in units.tsv +# The source of fastq files for every sequencing unit of all samples has to be provided in the units.tsv file. units: "config/units.tsv" single_end: False @@ -17,7 +17,7 @@ resources: release: 101 # Genome build build: R64-1-1 - # for testing data a specific chromosome can be selected + # for testing data a single chromosome can be selected (leave empty for a regular analysis) chromosome: # specify release version number of igenomes list to use (see https://github.com/nf-core/chipseq/releases), e.g. 1.2.2 igenomes_release: 1.2.2 @@ -50,7 +50,7 @@ params: activate: True consensus-peak-analysis: activate: True - # samtools view parameters: + # samtools view parameter suggestions (for full parameters, see: https://www.htslib.org/doc/samtools-view.html): # if duplicates should be removed in this filtering, add "-F 0x0400" to the params # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions), @@ -60,7 +60,7 @@ params: samtools-view-se: "-b -F 0x004" samtools-view-pe: "-b -F 0x004 -G 0x009 -f 0x001" plotfingerprint: - # Number of bins that sampled from the genome, for which the overlapping number of reads is computed for fingerprint plot + # --numberOfSamples parameter of deeptools plotFingerprint, see: https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html#Optional%20arguments number-of-samples: 500000 # optional parameters for picard's CollectMultipleMetrics from sorted, filtered and merged bam files in post analysis step # see https://gatk.broadinstitute.org/hc/en-us/articles/360037594031-CollectMultipleMetrics-Picard- diff --git a/config/units.tsv b/config/units.tsv index 3e27f79..2907c2d 100644 --- a/config/units.tsv +++ b/config/units.tsv @@ -1 +1 @@ -sample unit fragment_len_mean fragment_len_sd fq1 fq2 sra_accession platform +sample unit fq1 fq2 sra_accession platform diff --git a/workflow/report/multiqc_report.rst b/workflow/report/multiqc_report.rst index 5a14a9b..a409cbb 100644 --- a/workflow/report/multiqc_report.rst +++ b/workflow/report/multiqc_report.rst @@ -1,3 +1,2 @@ -**MultiQC** report is a collection of multiple plots and stats from Chip-seq processing pipeline (phantompeakqualtools), -Preseq analysis, metrics from Picard and Samtools. There are also quality control metrics and plots from FastQC analysis. -Detailed descriptions of the individual plots and statistics can be found in MultiQC report. +The **MultiQC** report is a collection of multiple plots, stats and metrics from phantompeakqualtools (Chip-seq processing), Preseq, Picard, Samtools and FastQC. +For detailed descriptions of the individual plots and statistics, load the MultiQC report by clicking on it. diff --git a/workflow/report/plot_annotatepeaks_homer.rst b/workflow/report/plot_annotatepeaks_homer.rst index 1ed0e62..251eb30 100644 --- a/workflow/report/plot_annotatepeaks_homer.rst +++ b/workflow/report/plot_annotatepeaks_homer.rst @@ -1,3 +1,2 @@ -`Homer annotatePeaks `_ is used to annotate the peaks relative -to known genomic features. The plots of this analysis show for all samples and their associated controls peak location -relative to annotation, percentage of unique genes to closest peak an peak distribution relative to TSS (Transcription Start Site). +`Homer annotatePeaks `_ assigns known genomic features to peaks. +For each sample-control pair, plots show the peak locations relative to annotated features, the percentage of unique genes to closest peak and the peak distribution relative to TSS (Transcription Start Site). diff --git a/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst b/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst index 016bd55..1dbc07d 100644 --- a/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst +++ b/workflow/report/plot_base_distribution_by_cycle_picard_mm.rst @@ -1,8 +1,8 @@ -`Base distribution by cycle plot -`_ **(Picard)** is -used as quality control for alignment-level and shows the nucleotide distribution per cycle of the bam files after -filtering, sorting, merging and removing orphans. For any cycle within reads the relative proportions of nucleotides -should reflect the AT:CG content. For all nucleotides flattish lines would be expected and any spikes would suggest a -systematic sequencing error. For more information about `collected Picard metrics +`Plot of base distribution per sequencing cycle +`_. +This **Picard** tool shows the nucleotide distribution per sequencing cycle of the bam files after filtering, sorting, merging and removing orphans. +For any sequencing cycle within reads, the relative proportions of nucleotides should reflect the AT:CG content. +For all nucleotides, flat lines would be expected and any spikes suggest a systematic sequencing error. +For more information about `collected Picard metrics `_ please -see `documentation `_. +see the `documentation `_. diff --git a/workflow/report/plot_consensus_peak_intersect.rst b/workflow/report/plot_consensus_peak_intersect.rst index e6d5864..c40e70e 100644 --- a/workflow/report/plot_consensus_peak_intersect.rst +++ b/workflow/report/plot_consensus_peak_intersect.rst @@ -1,2 +1,2 @@ **MACS2 and bedtools** merged consensus {{ snakemake.wildcards.peak }} peaks plot is generated by calculating the -proportion of intersection size assigned to {{ snakemake.wildcards.samples }} for {{ snakemake.wildcards.antibody }}. +proportion of intersection size assigned to all samples for {{ snakemake.wildcards.antibody }}. diff --git a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst index 6f3b891..80cba6a 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst @@ -1,6 +1,7 @@ -`MA plot `_ **(FDR 0.01)** -shows the log2 fold changes versus the mean of normalized counts from -`DESeq2 `_ analysis filtered on a false -discovery rate (FDR) threshold of 0.01 for pairwise comparisons of samples across the groups from a particular antibody. -For more information about DESeq2 please see +The `MA plot `_ **(FDR 0.01)** +displays the log2 fold changes versus the mean of normalized counts of the +DESeq2 analysis results. +The results of this plot are filtered on a false discovery rate (FDR) threshold of 0.01 and represent the comparison of the +{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups treated with the +{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst b/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst index 3fa72b2..beb1c59 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst @@ -1,6 +1,8 @@ -**Volcano plot (FDR 0.01)** shows the significance (adjusted p-value) versus the log2 fold changes of the results of the -`DESeq2 `_ analysis filtered on a false -discovery rate (FDR) threshold of 0.01 for pairwise comparisons of samples across the groups from a particular antibody. -For more information about DESeq2 please see +**Volcano plot (FDR 0.01)** shows the significance (adjusted p-value) versus the log2 fold changes of the +DESeq2 analysis results. +The results of this plot are filtered on a false discovery rate (FDR) threshold of 0.01 and represent the comparison of the +{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups for the +{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see `documentation `_. + diff --git a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst index dca55a6..c966585 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst @@ -1,8 +1,8 @@ -`MA plot `_ **(FDR 0.05)** -shows the log2 fold changes versus the mean of normalized counts from -`DESeq2 `_ analysis filtered on a false -discovery rate (FDR) threshold of 0.05 for pairwise comparisons of samples across the groups from a particular antibody. -For more information about DESeq2 please see +The `MA plot `_ **(FDR 0.05)** +displays the log2 fold changes versus the mean of normalized counts of the +DESeq2 analysis results. +The results of this plot are filtered on a false discovery rate (FDR) threshold of 0.05 and represent the comparison of the +{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups for the +{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see `documentation `_. - diff --git a/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst b/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst index f6fe20a..c60a082 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst @@ -1,5 +1,6 @@ -**Volcano plot (FDR 0.05)** shows the significance (adjusted p-value) versus the log2 fold changes of the results of the -`DESeq2 `_ analysis filtered on a false -discovery rate (FDR) threshold of 0.05 for pairwise comparisons of samples across the groups from a particular antibody. -For more information about DESeq2 please see +**Volcano plot (FDR 0.05)** shows the significance (adjusted p-value) versus the log2 fold changes of the +DESeq2 analysis results. +The results of this plot are filtered on a false discovery rate (FDR) threshold of 0.05 and represent the comparison of the +{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups for the +{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_heatmap.rst b/workflow/report/plot_deseq2_heatmap.rst index 599c234..0ed11a1 100644 --- a/workflow/report/plot_deseq2_heatmap.rst +++ b/workflow/report/plot_deseq2_heatmap.rst @@ -1,6 +1,6 @@ -`Heatmap plot `_ shows for each antibody a heatmap of -the `rlog `_ or +`Heatmap plot `_ shows a heatmap of the `rlog `_ or `vst `_ transformed counts from -DESeq2 analysis. For more information about DESeq2 please see +DESeq2 analysis for all groups treated with the {{snakemake.wildcards["antibody"]}} antibody +. For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_pca.rst b/workflow/report/plot_deseq2_pca.rst index db724a3..bce9d04 100644 --- a/workflow/report/plot_deseq2_pca.rst +++ b/workflow/report/plot_deseq2_pca.rst @@ -2,5 +2,5 @@ **(Principal Component Analysis)** describes variance of the `rlog `_ or `vst `_ transformed counts from -DESeq2 analysis with regard to the antibody used. For more information about DESeq2 please see +DESeq2 analysis with regard to the used {{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_sample_corr_heatmap.rst b/workflow/report/plot_deseq2_sample_corr_heatmap.rst index 7f91af9..1061e70 100644 --- a/workflow/report/plot_deseq2_sample_corr_heatmap.rst +++ b/workflow/report/plot_deseq2_sample_corr_heatmap.rst @@ -1,5 +1,7 @@ Correlation `heatmap plots `_ shows heatmaps of the `rlog `_ or `vst `_ transformed counts from -DESeq2 analysis for each pairwise comparison of samples across the groups from a particular antibody. For more information about DESeq2 please +the DESeq2 analysis. The results of this plot represent the comparison of the +{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups treated with the +{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see `documentation `_. diff --git a/workflow/report/plot_deseq2_scatter.rst b/workflow/report/plot_deseq2_scatter.rst index 6a8cc6f..74c9f90 100644 --- a/workflow/report/plot_deseq2_scatter.rst +++ b/workflow/report/plot_deseq2_scatter.rst @@ -1,5 +1,8 @@ **Scatter plots** uses the matrix of the `rlog `_ or `vst `_ transformed counts from -DESeq2 analysis for pairwise comparisons of samples across the groups from a particular antibody. For more information about DESeq2 please see -`documentation `_. +the DESeq2 analysis. The results of this plot represent the comparison of the +{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups treated with the +{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please +see `documentation `_. + diff --git a/workflow/report/plot_heatmap_deeptools.rst b/workflow/report/plot_heatmap_deeptools.rst index 68b871e..1c94b2a 100644 --- a/workflow/report/plot_heatmap_deeptools.rst +++ b/workflow/report/plot_heatmap_deeptools.rst @@ -1,4 +1,2 @@ **deepTools** `plotHeatmap `_ creates a -heatmap for scores over sets of genomic regions and gives a visualisation for the genome-wide enrichment of the samples. -The `scores `_ represent the signal over a -set of regions where all regions are scaled to the same size. +heatmap of read coverage over sets of genomic regions and gives a visualisation for the genome-wide enrichment of the samples. diff --git a/workflow/report/plot_macs2_qc.rst b/workflow/report/plot_macs2_qc.rst index bb8a330..73dba23 100644 --- a/workflow/report/plot_macs2_qc.rst +++ b/workflow/report/plot_macs2_qc.rst @@ -1,6 +1,7 @@ `MACS2 `_ **callpeak quality control plots** show for all -samples and their associated controls the number of peaks and their distributions of peak length, fold-change, FDR and -p-value from the results of the +samples and their associated controls the number of peaks and their distributions of peak length, the +`fold-change `_ of +the enrichment for their peak summit against random Poisson distribution, FDR and p-value from the results of the `MACS callpeak analysis `_. For more information about Model-based Analysis of ChIP-Seq (MACS) please see published `article `_. diff --git a/workflow/report/plot_profile_deeptools.rst b/workflow/report/plot_profile_deeptools.rst index 76ccd02..a88643a 100644 --- a/workflow/report/plot_profile_deeptools.rst +++ b/workflow/report/plot_profile_deeptools.rst @@ -1,4 +1,3 @@ **deepTools** `plotProfile `_ creates a -profile plot for scores over sets of genomic regions and gives a visualisation for the genome-wide enrichment of the -samples. The `scores `_ represent the -signal over a set of regions where all regions are scaled to the same size. +profile plot of read coverage over sets of genomic regions and gives a visualisation for the genome-wide enrichment of the +samples. diff --git a/workflow/report/plot_quality_by_cycle_picard_mm.rst b/workflow/report/plot_quality_by_cycle_picard_mm.rst index b8b8a61..63bb520 100644 --- a/workflow/report/plot_quality_by_cycle_picard_mm.rst +++ b/workflow/report/plot_quality_by_cycle_picard_mm.rst @@ -1,7 +1,7 @@ `Mean quality by cycle plot `_ **(Picard)** is used as quality control for sequencing machine performance and collects the mean quality by cycle of the bam files after -filtering, sorting, merging and removing orphans. Any spikes in quality within reads may indicate technical problems +filtering, sorting, merging and removing orphans. Clear drops in quality within reads may indicate technical problems during sequencing. For more information about `collected Picard metrics `_ please see `documentation `_. diff --git a/workflow/rules/consensus_peak_analysis.smk b/workflow/rules/consensus_peak_analysis.smk index 6a7057f..5cb4ce1 100644 --- a/workflow/rules/consensus_peak_analysis.smk +++ b/workflow/rules/consensus_peak_analysis.smk @@ -194,39 +194,38 @@ rule featurecounts_deseq2: igv_FDR_5_bed="results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.deseq2.FDR_0.05.igv.txt", plot_FDR_1_perc_MA=report( directory("results/deseq2/comparison_plots/MA_plots/FDR_0.01_{antibody}consensus_{peak}-peaks"), - patterns=["{antibody}.{{group_1_vs_group_2}}.MA-plot_FDR_0.01.pdf"], + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.MA-plot_FDR_0.01.pdf"], caption = "../report/plot_deseq2_FDR_1_perc_MA.rst", category = "DESeq2"), plot_FDR_5_perc_MA=report( directory("results/deseq2/comparison_plots/MA_plots/FDR_0.05_{antibody}consensus_{peak}-peaks"), - patterns=["{antibody}.{{group_1_vs_group_2}.MA-plot_FDR_0.05.pdf"], + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.MA-plot_FDR_0.05.pdf"], caption = "../report/plot_deseq2_FDR_5_perc_MA.rst", category = "DESeq2"), plot_FDR_1_perc_volcano=report( directory("results/deseq2/comparison_plots/volcano_plots/FDR_0.01_{antibody}consensus_{peak}-peaks"), - patterns=["{antibody}.{{group_1_vs_group_2}}.volcano-plot_FDR_0.01.pdf"], + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.volcano_FDR_0.01.pdf"], caption = "../report/plot_deseq2_FDR_1_perc_volcano.rst", category = "DESeq2"), plot_FDR_5_perc_volcano=report( directory("results/deseq2/comparison_plots/volcano_plots/FDR_0.05_{antibody}consensus_{peak}-peaks"), - patterns=["{antibody}.{{group_1_vs_group_2}}.volcano-plot_FDR_0.05.pdf"], + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.volcano_FDR_0.05.pdf"], caption = "../report/plot_deseq2_FDR_5_perc_volcano.rst", category = "DESeq2"), plot_sample_corr_heatmap=report( directory("results/deseq2/comparison_plots/correlation_heatmaps_{antibody}consensus_{peak}-peaks"), - patterns=["{antibody}.{{group_1_vs_group_2}}.correlation_heatmap.pdf"], + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.correlation_heatmap.pdf"], caption = "../report/plot_deseq2_sample_corr_heatmap.rst", category = "DESeq2"), plot_scatter=report( directory("results/deseq2/comparison_plots/scatter_plots_{antibody}consensus_{peak}-peaks"), - patterns=["{antibody}.{{group_1_vs_group_2}}.scatter_plots.pdf"], + patterns=["{antibody}.{group_1}_{antibody_1}_{control_1}vs{group_2}_{antibody_2}_{control_2}.scatter_plots.pdf"], caption = "../report/plot_deseq2_scatter.rst", category = "DESeq2") threads: 2 params: - vst = config["params"]["deseq2"]["vst"], - antibody = lambda w: w.antibody + vst = config["params"]["deseq2"]["vst"] log: "logs/deseq2/{antibody}.consensus_{peak}-peaks.featureCounts.log" conda: diff --git a/workflow/rules/peak_analysis.smk b/workflow/rules/peak_analysis.smk index 1ebfb96..8c110ae 100644 --- a/workflow/rules/peak_analysis.smk +++ b/workflow/rules/peak_analysis.smk @@ -11,7 +11,7 @@ rule plot_fingerprint: fingerprint=report( "results/deeptools/{sample}-{control}.plot_fingerprint.pdf", caption="../report/plot_fingerprint_deeptools.rst", - category="QC"), + category="other QC"), counts="results/deeptools/{sample}-{control}.fingerprint_counts.txt", qc_metrics="results/deeptools/{sample}-{control}.fingerprint_qcmetrics.txt" log: diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 5314466..d0b3480 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -16,7 +16,7 @@ rule multiqc: input: get_multiqc_input output: - report("results/qc/multiqc/multiqc.html", caption="../report/multiqc_report.rst", category="QC") + report("results/qc/multiqc/multiqc.html", caption="../report/multiqc_report.rst", category="MultiQC") log: "logs/multiqc.log" wrapper: diff --git a/workflow/scripts/featurecounts_deseq2.R b/workflow/scripts/featurecounts_deseq2.R index 7244bcf..1f96ad7 100644 --- a/workflow/scripts/featurecounts_deseq2.R +++ b/workflow/scripts/featurecounts_deseq2.R @@ -285,7 +285,7 @@ if (file.exists(ResultsFile) == FALSE) { ## WRITE RESULTS FILE if (MIN_FDR == 0.01) { - # AVI: dynamically file name extensions added + # AVI: dynamically generated file name extensions added dirs <- c(snakemake@output[["FDR_1_perc_res"]],snakemake@output[["FDR_1_perc_bed"]],snakemake@output[["plot_FDR_1_perc_MA"]], snakemake@output[["plot_FDR_1_perc_volcano"]],snakemake@output[["FDR_5_perc_res"]],snakemake@output[["FDR_5_perc_bed"]], snakemake@output[["plot_FDR_5_perc_MA"]],snakemake@output[["plot_FDR_5_perc_volcano"]],snakemake@output[["plot_sample_corr_heatmap"]], @@ -296,19 +296,19 @@ if (file.exists(ResultsFile) == FALSE) { } } - CompResultsFile <- paste(snakemake@output[["FDR_1_perc_res"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.txt", sep="") - CompBEDFile <- paste(snakemake@output[["FDR_1_perc_bed"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.bed", sep="") - MAplotFile <- paste(snakemake@output[["plot_FDR_1_perc_MA"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.01.pdf", sep="") # AVI: added to create separate pdf files - VolcanoPlotFile <- paste(snakemake@output[["plot_FDR_1_perc_volcano"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.01.pdf", sep="") # AVI: added to create separate pdf files + CompResultsFile <- file.path(snakemake@output[["FDR_1_perc_res"]], paste(snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.txt", sep="")) + CompBEDFile <- file.path(snakemake@output[["FDR_1_perc_bed"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.bed", sep="")) + MAplotFile <- file.path(snakemake@output[["plot_FDR_1_perc_MA"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.01.pdf", sep="")) # AVI: added to create separate pdf files + VolcanoPlotFile <- file.path(snakemake@output[["plot_FDR_1_perc_volcano"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.01.pdf", sep="")) # AVI: added to create separate pdf files } if (MIN_FDR == 0.05) { # AVI: dynamically file name extensions added - CompResultsFile <- paste(snakemake@output[["FDR_5_perc_res"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.txt", sep="") - CompBEDFile <- paste(snakemake@output[["FDR_5_perc_bed"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.bed", sep="") + CompResultsFile <- file.path(snakemake@output[["FDR_5_perc_res"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.txt", sep="")) + CompBEDFile <- file.path(snakemake@output[["FDR_5_perc_bed"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.bed", sep="")) # AVI: write paths of FDR 0.05 bed files to igv file cat(paste0(CompBEDFile, "\t255,0,0\n"),file=snakemake@output[["igv_FDR_5_bed"]],append=TRUE) - MAplotFile <- paste(snakemake@output[["plot_FDR_5_perc_MA"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.05.pdf", sep="") # AVI: added to create separate pdf files - VolcanoPlotFile <- paste(snakemake@output[["plot_FDR_5_perc_volcano"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.05.pdf", sep="") # AVI: added to create separate pdf files + MAplotFile <- file.path(snakemake@output[["plot_FDR_5_perc_MA"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.05.pdf", sep="")) # AVI: added to create separate pdf files + VolcanoPlotFile <- file.path(snakemake@output[["plot_FDR_5_perc_volcano"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.05.pdf", sep="")) # AVI: added to create separate pdf files } write.table(pass.fdr.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE) @@ -335,7 +335,7 @@ if (file.exists(ResultsFile) == FALSE) { } ## SAMPLE CORRELATION HEATMAP - CorrHeatmapFile <- paste(snakemake@output[["plot_sample_corr_heatmap"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".correlation_heatmap.pdf", sep="") # AVI: dynamically file name extensions added + CorrHeatmapFile <- file.path(snakemake@output[["plot_sample_corr_heatmap"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".correlation_heatmap.pdf", sep="")) # AVI: dynamically file name extensions added pdf(file=CorrHeatmapFile,width=10,height=8) # AVI: added to create separate pdf files rld.subset <- assay(rld)[,comp.samples] sampleDists <- dist(t(rld.subset)) @@ -345,7 +345,7 @@ if (file.exists(ResultsFile) == FALSE) { dev.off() # AVI: added to create separate pdf files ## SCATTER PLOT FOR RLOG COUNTS - ScatterPlotFile <- paste(snakemake@output[["plot_scatter"]], "/", snakemake@params[["antibody"]], ".", CompPrefix, ".scatter_plots.pdf", sep="") # AVI: dynamically file name extensions added + ScatterPlotFile <- file.path(snakemake@output[["plot_scatter"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".scatter_plots.pdf", sep="")) # AVI: dynamically file name extensions added pdf(file=ScatterPlotFile,width=10,height=8) # AVI: added to create separate pdf files combs <- combn(comp.samples,2,simplify=FALSE) clabels <- sapply(combs,function(x){paste(x,collapse=' & ')}) From 69180750da7701f28d6d421a54c7b6a98852510f Mon Sep 17 00:00:00 2001 From: AntonieV Date: Sun, 4 Jul 2021 18:10:29 +0200 Subject: [PATCH 10/40] changes on 'find' commands according PR#8 --- workflow/rules/consensus_peak_analysis.smk | 5 +++-- workflow/rules/peak_analysis.smk | 5 +++-- workflow/rules/post-analysis.smk | 9 +++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/workflow/rules/consensus_peak_analysis.smk b/workflow/rules/consensus_peak_analysis.smk index 5cb4ce1..1d76d32 100644 --- a/workflow/rules/consensus_peak_analysis.smk +++ b/workflow/rules/consensus_peak_analysis.smk @@ -89,11 +89,12 @@ rule create_consensus_igv: "results/macs2_merged_expand/{antibody}.consensus_{peak}-peaks.boolean.bed" output: "results/IGV/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.txt" + params: + lambda w, input: "\n".join(["{}\t0,0,0".format(path) for path in input]) log: "logs/igv/consensus/merged_library.{antibody}.consensus_{peak}-peaks.igv.log" shell: - "find {input} -type f -name '*.consensus_{wildcards.peak}-peaks.boolean.bed' -exec " - "echo -e '{{}}\t0,0,0' \; > {output} 2> {log}" + "echo -e '{params}' > {output} 2> {log}" rule homer_consensus_annotatepeaks: input: diff --git a/workflow/rules/peak_analysis.smk b/workflow/rules/peak_analysis.smk index 8c110ae..bf81907 100644 --- a/workflow/rules/peak_analysis.smk +++ b/workflow/rules/peak_analysis.smk @@ -177,11 +177,12 @@ rule create_igv_peaks: "results/macs2_callpeak/{sample}-{control}.{peak}_peaks.{peak}Peak" output: "results/IGV/macs2_callpeak-{peak}/merged_library.{sample}-{control}.{peak}_peaks.igv.txt" + params: + lambda w, input: "\n".join(["{}\t0,0,178".format(path) for path in input]) log: "logs/igv/create_igv_peaks/merged_library.{sample}-{control}.{peak}_peaks.log" shell: - " find {input} -type f -name '*_peaks.{wildcards.peak}Peak' -exec " - "echo -e '{{}}\t0,0,178' \; > {output} 2> {log}" + "echo -e '{params}' > {output} 2> {log}" rule homer_annotatepeaks: input: diff --git a/workflow/rules/post-analysis.smk b/workflow/rules/post-analysis.smk index 61fe5cd..673f317 100644 --- a/workflow/rules/post-analysis.smk +++ b/workflow/rules/post-analysis.smk @@ -112,15 +112,16 @@ rule bedGraphToBigWig: rule create_igv_bigwig: input: - "resources/ref/genome.bed", - expand("results/big_wig/{sample}.bigWig", sample=samples.index) + genome="resources/ref/genome.bed", + bigwig=expand("results/big_wig/{sample}.bigWig", sample=samples.index) output: "results/IGV/big_wig/merged_library.bigWig.igv.txt" + params: + lambda w, input: "\n".join(["{}\t0,0,178".format(path) for path in input.bigwig]) log: "logs/igv/create_igv_bigwig.log" shell: - "find {input} -type f -name '*.bigWig' -exec " - "echo -e '{{}}\t0,0,178' \; > {output} 2> {log}" + "echo -e '{params}' > {output} 2> {log}" rule compute_matrix: input: From 0135bd0d19fb8b2e3c795deb26f8c5ef568e281e Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:00:43 +0100 Subject: [PATCH 11/40] Update .test/config/config.yaml Co-authored-by: David Laehnemann --- .test/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index b521887..4f3afe0 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -1,4 +1,4 @@ -# This file should contains everything to configure the workflow on a global scale. +# This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config/samples.tsv" From 3521c9da58672bea13ff96551469891c50d55f5e Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:01:02 +0100 Subject: [PATCH 12/40] Update .test/config/config.yaml Co-authored-by: David Laehnemann --- .test/config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config/config.yaml b/.test/config/config.yaml index 4f3afe0..921f575 100644 --- a/.test/config/config.yaml +++ b/.test/config/config.yaml @@ -1,6 +1,6 @@ # This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config/samples.tsv" # to download reads from SRA the accession numbers (see https://www.ncbi.nlm.nih.gov/sra) of samples must be given in # units.tsv dataset for testing this workflow with single end reads: From f1eb6700beb71f0f1f4c17851612e213f58959eb Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:01:27 +0100 Subject: [PATCH 13/40] Update .test/config_paired_end/config.yaml Co-authored-by: David Laehnemann --- .test/config_paired_end/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config_paired_end/config.yaml b/.test/config_paired_end/config.yaml index acfecff..c381c34 100644 --- a/.test/config_paired_end/config.yaml +++ b/.test/config_paired_end/config.yaml @@ -1,4 +1,4 @@ -# This file should contains everything to configure the workflow on a global scale. +# This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config_paired_end_reduced/samples.tsv" From da60d1c1b00535f562d3dda8fa936291e2561933 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:01:38 +0100 Subject: [PATCH 14/40] Update .test/config_paired_end/config.yaml Co-authored-by: David Laehnemann --- .test/config_paired_end/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config_paired_end/config.yaml b/.test/config_paired_end/config.yaml index c381c34..7170824 100644 --- a/.test/config_paired_end/config.yaml +++ b/.test/config_paired_end/config.yaml @@ -1,6 +1,6 @@ # This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config_paired_end_reduced/samples.tsv" units: "config_paired_end_reduced/units.tsv" single_end: False From e9c43d3817fa76488c85afec4c4809cdc84d5639 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:01:49 +0100 Subject: [PATCH 15/40] Update .test/config_paired_end_reduced/config.yaml Co-authored-by: David Laehnemann --- .test/config_paired_end_reduced/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config_paired_end_reduced/config.yaml b/.test/config_paired_end_reduced/config.yaml index 1e6f08f..275bfda 100644 --- a/.test/config_paired_end_reduced/config.yaml +++ b/.test/config_paired_end_reduced/config.yaml @@ -1,4 +1,4 @@ -# This file should contains everything to configure the workflow on a global scale. +# This file should contain everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config_paired_end_reduced/samples.tsv" From 5610eb0fb77d4e3808a565fe9b27904ac02a15af Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:09:52 +0100 Subject: [PATCH 16/40] Update .test/config_single_end/config.yaml Co-authored-by: David Laehnemann --- .test/config_single_end/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config_single_end/config.yaml b/.test/config_single_end/config.yaml index c376453..7fa0034 100644 --- a/.test/config_single_end/config.yaml +++ b/.test/config_single_end/config.yaml @@ -1,4 +1,4 @@ -# This file should contains everything to configure the workflow on a global scale. +# This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config_single_end/samples.tsv" From 88a2df2899757ea849b1f5b41ae8a551199e0590 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:11:22 +0100 Subject: [PATCH 17/40] Update .test/config_single_end/config.yaml Co-authored-by: David Laehnemann --- .test/config_single_end/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config_single_end/config.yaml b/.test/config_single_end/config.yaml index 7fa0034..60d4e03 100644 --- a/.test/config_single_end/config.yaml +++ b/.test/config_single_end/config.yaml @@ -1,6 +1,6 @@ # This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config_single_end/samples.tsv" # to download reads from SRA the accession numbers (see https://www.ncbi.nlm.nih.gov/sra) of samples must be given in # units.tsv dataset for testing this workflow with single end reads: From 01fd686afb0c5dbaae56651a9220c015458721dc Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:26:34 +0100 Subject: [PATCH 18/40] Update .test/config_single_end_reduced/config.yaml Co-authored-by: David Laehnemann --- .test/config_single_end_reduced/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config_single_end_reduced/config.yaml b/.test/config_single_end_reduced/config.yaml index c7420b2..c0076f9 100644 --- a/.test/config_single_end_reduced/config.yaml +++ b/.test/config_single_end_reduced/config.yaml @@ -1,4 +1,4 @@ -# This file should contains everything to configure the workflow on a global scale. +# This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains # one row per sample. It can be parsed easily via pandas. samples: "config_single_end_reduced/samples.tsv" From 0b10a423db73ef08e024c9785eec77d78fe8ee4b Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:26:59 +0100 Subject: [PATCH 19/40] Update .test/config_single_end_reduced/config.yaml Co-authored-by: David Laehnemann --- .test/config_single_end_reduced/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config_single_end_reduced/config.yaml b/.test/config_single_end_reduced/config.yaml index c0076f9..18c9380 100644 --- a/.test/config_single_end_reduced/config.yaml +++ b/.test/config_single_end_reduced/config.yaml @@ -1,6 +1,6 @@ # This file contains everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config_single_end_reduced/samples.tsv" # to download reads from SRA the accession numbers (see https://www.ncbi.nlm.nih.gov/sra) of samples must be given in # units.tsv dataset for testing this workflow with single end reads: From 50ff493ce93fc7ad7102121965236a7ce1a68a3e Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:27:20 +0100 Subject: [PATCH 20/40] Update workflow/report/plot_consensus_peak_intersect.rst Co-authored-by: David Laehnemann --- workflow/report/plot_consensus_peak_intersect.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/report/plot_consensus_peak_intersect.rst b/workflow/report/plot_consensus_peak_intersect.rst index c40e70e..3fdb52b 100644 --- a/workflow/report/plot_consensus_peak_intersect.rst +++ b/workflow/report/plot_consensus_peak_intersect.rst @@ -1,2 +1,2 @@ -**MACS2 and bedtools** merged consensus {{ snakemake.wildcards.peak }} peaks plot is generated by calculating the -proportion of intersection size assigned to all samples for {{ snakemake.wildcards.antibody }}. +**MACS2** consensus peaks plot for {{ snakemake.wildcards.peak }} peaks is an `UpSet plot `_ that shows +which sample subsets for the antibody {{ snakemake.wildcards.antibody }} share how many peaks. From 44f4f1756e2fa84eb6143b14355343b0ff7827de Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 00:27:37 +0100 Subject: [PATCH 21/40] Update config/config.yaml Co-authored-by: David Laehnemann --- config/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/config.yaml b/config/config.yaml index 8e840fb..b4a0681 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,6 +1,6 @@ # This file contains everything to configure the workflow on a global scale. # The sample based data must be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config/samples.tsv" # The source of fastq files for every sequencing unit of all samples has to be provided in the units.tsv file. units: "config/units.tsv" From ee2c09d0810d42d6990abb083b0ca9002c79d5a6 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 19:07:10 +0100 Subject: [PATCH 22/40] Update config.yaml --- .test/config_paired_end_reduced/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.test/config_paired_end_reduced/config.yaml b/.test/config_paired_end_reduced/config.yaml index 275bfda..e8dde9a 100644 --- a/.test/config_paired_end_reduced/config.yaml +++ b/.test/config_paired_end_reduced/config.yaml @@ -1,6 +1,6 @@ # This file should contain everything to configure the workflow on a global scale. # In case of sample based data, it should be complemented by a samples.tsv file that contains -# one row per sample. It can be parsed easily via pandas. +# one row per sample. It is parsed in common.smk using pandas (https://pandas.pydata.org/). samples: "config_paired_end_reduced/samples.tsv" units: "config_paired_end_reduced/units.tsv" single_end: False From 84e476c02a84ddc2091160bab6389fc03477e029 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 19:08:29 +0100 Subject: [PATCH 23/40] Update workflow/report/plot_deseq2_FDR_1_perc_MA.rst Co-authored-by: David Laehnemann --- workflow/report/plot_deseq2_FDR_1_perc_MA.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst index 80cba6a..6271191 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_MA.rst @@ -1,7 +1,9 @@ The `MA plot `_ **(FDR 0.01)** -displays the log2 fold changes versus the mean of normalized counts of the -DESeq2 analysis results. -The results of this plot are filtered on a false discovery rate (FDR) threshold of 0.01 and represent the comparison of the -{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups treated with the -{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see +displays the effect size (log2 fold changes) of differences in feature counts +between the group {{snakemake.wildcards["group_1"]}} +and the group {{snakemake.wildcards["group_2"]}} +(both treated with the {{snakemake.wildcards["antibody"]}} antibody) +as determined by DESeq2 (based on the mean of normalized counts). +The results of this plot have been filtered for a false discovery rate (FDR) of ``0.01``. +For more information about DESeq2 please see the `documentation `_. From 95f6bbd0920068a349e9c3edf746530764fe5f02 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 19:09:04 +0100 Subject: [PATCH 24/40] Update workflow/report/plot_deseq2_FDR_1_perc_volcano.rst Co-authored-by: David Laehnemann --- workflow/report/plot_deseq2_FDR_1_perc_volcano.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst b/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst index beb1c59..c0f7e4d 100644 --- a/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst +++ b/workflow/report/plot_deseq2_FDR_1_perc_volcano.rst @@ -1,8 +1,10 @@ -**Volcano plot (FDR 0.01)** shows the significance (adjusted p-value) versus the log2 fold changes of the -DESeq2 analysis results. -The results of this plot are filtered on a false discovery rate (FDR) threshold of 0.01 and represent the comparison of the -{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups for the -{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see +The **Volcano plot (FDR 0.01)** shows the significance (adjusted p-value) versus the effect size (log2 fold changes) +differences in feature counts between the group {{snakemake.wildcards["group_1"]}} +and the group {{snakemake.wildcards["group_2"]}} group +(both treated with the {{snakemake.wildcards["antibody"]}} antibody) +as determined by DESeq2 (based on the mean of normalized counts). +The results of this plot have been filtered for a false discovery rate (FDR) of ``0.01``. +For more information about DESeq2 please see the `documentation `_. From b7413b17d3d2396c4ec2b088eebbd410f81a6187 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 19:09:35 +0100 Subject: [PATCH 25/40] Update workflow/report/plot_deseq2_FDR_5_perc_MA.rst Co-authored-by: David Laehnemann --- workflow/report/plot_deseq2_FDR_5_perc_MA.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst index c966585..6a85b50 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_MA.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_MA.rst @@ -1,8 +1,10 @@ The `MA plot `_ **(FDR 0.05)** -displays the log2 fold changes versus the mean of normalized counts of the -DESeq2 analysis results. -The results of this plot are filtered on a false discovery rate (FDR) threshold of 0.05 and represent the comparison of the -{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups for the -{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see +displays the effect size (log2 fold changes) of differences in feature counts +between the group {{snakemake.wildcards["group_1"]}} +and the group {{snakemake.wildcards["group_2"]}} +(both treated with the {{snakemake.wildcards["antibody"]}} antibody) +as determined by DESeq2 (based on the mean of normalized counts). +The results of this plot have been filtered for a false discovery rate (FDR) of ``0.05``. +For more information about DESeq2 please see the `documentation `_. From 604c90fe4c36ece8448d13e30ac8125e91397219 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 19:09:53 +0100 Subject: [PATCH 26/40] Update workflow/report/plot_deseq2_FDR_5_perc_volcano.rst Co-authored-by: David Laehnemann --- workflow/report/plot_deseq2_FDR_5_perc_volcano.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst b/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst index c60a082..4329bfa 100644 --- a/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst +++ b/workflow/report/plot_deseq2_FDR_5_perc_volcano.rst @@ -1,6 +1,8 @@ -**Volcano plot (FDR 0.05)** shows the significance (adjusted p-value) versus the log2 fold changes of the -DESeq2 analysis results. -The results of this plot are filtered on a false discovery rate (FDR) threshold of 0.05 and represent the comparison of the -{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups for the -{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see +The **Volcano plot (FDR 0.01)** shows the significance (adjusted p-value) versus the effect size (log2 fold changes) +differences in feature counts between the group {{snakemake.wildcards["group_1"]}} +and the group {{snakemake.wildcards["group_2"]}} group +(both treated with the {{snakemake.wildcards["antibody"]}} antibody) +as determined by DESeq2 (based on the mean of normalized counts). +The results of this plot have been filtered for a false discovery rate (FDR) of ``0.05``. +For more information about DESeq2 please see the `documentation `_. From 8c74df26920acd58dfec1062ef284101c1190b9f Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 19:10:52 +0100 Subject: [PATCH 27/40] Update workflow/report/plot_deseq2_heatmap.rst Co-authored-by: David Laehnemann --- workflow/report/plot_deseq2_heatmap.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/workflow/report/plot_deseq2_heatmap.rst b/workflow/report/plot_deseq2_heatmap.rst index 0ed11a1..732e018 100644 --- a/workflow/report/plot_deseq2_heatmap.rst +++ b/workflow/report/plot_deseq2_heatmap.rst @@ -1,6 +1,7 @@ -`Heatmap plot `_ shows a heatmap of the `rlog `_ or -`vst `_ transformed counts from -DESeq2 analysis for all groups treated with the {{snakemake.wildcards["antibody"]}} antibody -. For more information about DESeq2 please see +`Heatmap `_ of the +{% if snakemake.params.vst == "true" %}vst{% else %}rlog{% endif %} transformed feature counts from +DESeq2 for all groups treated with the {{snakemake.wildcards["antibody"]}} antibody. +There is more detailed `information on the count transformation in the docs `_. +For more information about DESeq2 in general, please also see the `documentation `_. From ae39a3199ba327e1dffd3a1dd369e673001a994a Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:12:28 +0100 Subject: [PATCH 28/40] Update workflow/scripts/featurecounts_deseq2.R Co-authored-by: David Laehnemann --- workflow/scripts/featurecounts_deseq2.R | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/workflow/scripts/featurecounts_deseq2.R b/workflow/scripts/featurecounts_deseq2.R index 1f96ad7..fee5011 100644 --- a/workflow/scripts/featurecounts_deseq2.R +++ b/workflow/scripts/featurecounts_deseq2.R @@ -286,10 +286,18 @@ if (file.exists(ResultsFile) == FALSE) { ## WRITE RESULTS FILE if (MIN_FDR == 0.01) { # AVI: dynamically generated file name extensions added - dirs <- c(snakemake@output[["FDR_1_perc_res"]],snakemake@output[["FDR_1_perc_bed"]],snakemake@output[["plot_FDR_1_perc_MA"]], - snakemake@output[["plot_FDR_1_perc_volcano"]],snakemake@output[["FDR_5_perc_res"]],snakemake@output[["FDR_5_perc_bed"]], - snakemake@output[["plot_FDR_5_perc_MA"]],snakemake@output[["plot_FDR_5_perc_volcano"]],snakemake@output[["plot_sample_corr_heatmap"]], - snakemake@output[["plot_scatter"]]) + dirs <- c( + snakemake@output[["FDR_1_perc_res"]], + snakemake@output[["FDR_1_perc_bed"]], + snakemake@output[["plot_FDR_1_perc_MA"]], + snakemake@output[["plot_FDR_1_perc_volcano"]], + snakemake@output[["FDR_5_perc_res"]], + snakemake@output[["FDR_5_perc_bed"]], + snakemake@output[["plot_FDR_5_perc_MA"]], + snakemake@output[["plot_FDR_5_perc_volcano"]], + snakemake@output[["plot_sample_corr_heatmap"]], + snakemake@output[["plot_scatter"]] + ) for (dir in dirs) { if (file.exists(dir) == FALSE) { dir.create(dir,recursive=TRUE) From e4865b3b184c113263c99438e9ae6f17c48888d8 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:12:49 +0100 Subject: [PATCH 29/40] Update workflow/scripts/featurecounts_deseq2.R Co-authored-by: David Laehnemann --- workflow/scripts/featurecounts_deseq2.R | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/workflow/scripts/featurecounts_deseq2.R b/workflow/scripts/featurecounts_deseq2.R index fee5011..223b5ac 100644 --- a/workflow/scripts/featurecounts_deseq2.R +++ b/workflow/scripts/featurecounts_deseq2.R @@ -304,10 +304,22 @@ if (file.exists(ResultsFile) == FALSE) { } } - CompResultsFile <- file.path(snakemake@output[["FDR_1_perc_res"]], paste(snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.txt", sep="")) - CompBEDFile <- file.path(snakemake@output[["FDR_1_perc_bed"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.bed", sep="")) - MAplotFile <- file.path(snakemake@output[["plot_FDR_1_perc_MA"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.01.pdf", sep="")) # AVI: added to create separate pdf files - VolcanoPlotFile <- file.path(snakemake@output[["plot_FDR_1_perc_volcano"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.01.pdf", sep="")) # AVI: added to create separate pdf files + CompResultsFile <- file.path( + snakemake@output[["FDR_1_perc_res"]], + paste(snakemake@params[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.txt", sep="") + ) + CompBEDFile <- file.path( + snakemake@output[["FDR_1_perc_bed"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.01_results.bed", sep="") + ) + MAplotFile <- file.path( + snakemake@output[["plot_FDR_1_perc_MA"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.01.pdf", sep="") + ) # AVI: added to create separate pdf files + VolcanoPlotFile <- file.path( + snakemake@output[["plot_FDR_1_perc_volcano"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.01.pdf", sep="") + ) # AVI: added to create separate pdf files } if (MIN_FDR == 0.05) { # AVI: dynamically file name extensions added From 422f0ccf58600e1855adb7c6d59b89027eb86df6 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:13:01 +0100 Subject: [PATCH 30/40] Update workflow/scripts/featurecounts_deseq2.R Co-authored-by: David Laehnemann --- workflow/scripts/featurecounts_deseq2.R | 27 +++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/workflow/scripts/featurecounts_deseq2.R b/workflow/scripts/featurecounts_deseq2.R index 223b5ac..4527af7 100644 --- a/workflow/scripts/featurecounts_deseq2.R +++ b/workflow/scripts/featurecounts_deseq2.R @@ -322,13 +322,28 @@ if (file.exists(ResultsFile) == FALSE) { ) # AVI: added to create separate pdf files } if (MIN_FDR == 0.05) { - # AVI: dynamically file name extensions added - CompResultsFile <- file.path(snakemake@output[["FDR_5_perc_res"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.txt", sep="")) - CompBEDFile <- file.path(snakemake@output[["FDR_5_perc_bed"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.bed", sep="")) + # AVI: dynamically create file names + CompResultsFile <- file.path( + snakemake@output[["FDR_5_perc_res"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.txt", sep="") + ) + CompBEDFile <- file.path( + snakemake@output[["FDR_5_perc_bed"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".FDR_0.05_results.bed", sep="") + ) # AVI: write paths of FDR 0.05 bed files to igv file - cat(paste0(CompBEDFile, "\t255,0,0\n"),file=snakemake@output[["igv_FDR_5_bed"]],append=TRUE) - MAplotFile <- file.path(snakemake@output[["plot_FDR_5_perc_MA"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.05.pdf", sep="")) # AVI: added to create separate pdf files - VolcanoPlotFile <- file.path(snakemake@output[["plot_FDR_5_perc_volcano"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.05.pdf", sep="")) # AVI: added to create separate pdf files + cat( + paste0(CompBEDFile, "\t255,0,0\n"), + file=snakemake@output[["igv_FDR_5_bed"]], + append=TRUE + ) + MAplotFile <- file.path( + snakemake@output[["plot_FDR_5_perc_MA"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".MA-plot_FDR_0.05.pdf", sep="") + ) # AVI: added to create separate pdf files + VolcanoPlotFile <- file.path( + snakemake@output[["plot_FDR_5_perc_volcano"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".volcano_FDR_0.05.pdf", sep="") + ) # AVI: added to create separate pdf files } write.table(pass.fdr.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE) From 799f0d92ca7318e87164b2fd7f78548b2f4fdb42 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:13:16 +0100 Subject: [PATCH 31/40] Update workflow/scripts/featurecounts_deseq2.R Co-authored-by: David Laehnemann --- workflow/scripts/featurecounts_deseq2.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/featurecounts_deseq2.R b/workflow/scripts/featurecounts_deseq2.R index 4527af7..b4120b0 100644 --- a/workflow/scripts/featurecounts_deseq2.R +++ b/workflow/scripts/featurecounts_deseq2.R @@ -370,7 +370,10 @@ if (file.exists(ResultsFile) == FALSE) { } ## SAMPLE CORRELATION HEATMAP - CorrHeatmapFile <- file.path(snakemake@output[["plot_sample_corr_heatmap"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".correlation_heatmap.pdf", sep="")) # AVI: dynamically file name extensions added + CorrHeatmapFile <- file.path( + snakemake@output[["plot_sample_corr_heatmap"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".correlation_heatmap.pdf", sep="") + ) # AVI: dynamically create file names pdf(file=CorrHeatmapFile,width=10,height=8) # AVI: added to create separate pdf files rld.subset <- assay(rld)[,comp.samples] sampleDists <- dist(t(rld.subset)) From 0e11059e209f21f4672ecf95bbec876981b4af9e Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:13:29 +0100 Subject: [PATCH 32/40] Update workflow/scripts/featurecounts_deseq2.R Co-authored-by: David Laehnemann --- workflow/scripts/featurecounts_deseq2.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/featurecounts_deseq2.R b/workflow/scripts/featurecounts_deseq2.R index b4120b0..dd4e55d 100644 --- a/workflow/scripts/featurecounts_deseq2.R +++ b/workflow/scripts/featurecounts_deseq2.R @@ -383,7 +383,10 @@ if (file.exists(ResultsFile) == FALSE) { dev.off() # AVI: added to create separate pdf files ## SCATTER PLOT FOR RLOG COUNTS - ScatterPlotFile <- file.path(snakemake@output[["plot_scatter"]], paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".scatter_plots.pdf", sep="")) # AVI: dynamically file name extensions added + ScatterPlotFile <- file.path( + snakemake@output[["plot_scatter"]], + paste(snakemake@wildcards[["antibody"]], ".", CompPrefix, ".scatter_plots.pdf", sep="") + ) # AVI: dynamically create file names pdf(file=ScatterPlotFile,width=10,height=8) # AVI: added to create separate pdf files combs <- combn(comp.samples,2,simplify=FALSE) clabels <- sapply(combs,function(x){paste(x,collapse=' & ')}) From 15f3e949a24b6d0975ccb89c5047c26c8a55d136 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:21:24 +0100 Subject: [PATCH 33/40] Update workflow/report/plot_deseq2_pca.rst Co-authored-by: David Laehnemann --- workflow/report/plot_deseq2_pca.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/workflow/report/plot_deseq2_pca.rst b/workflow/report/plot_deseq2_pca.rst index bce9d04..e75d5d4 100644 --- a/workflow/report/plot_deseq2_pca.rst +++ b/workflow/report/plot_deseq2_pca.rst @@ -1,6 +1,7 @@ `PCA plot `_ -**(Principal Component Analysis)** describes variance of the -`rlog `_ or -`vst `_ transformed counts from -DESeq2 analysis with regard to the used {{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please see +**(Principal Component Analysis)** show the variance of the +{% if snakemake.params.vst == "true" %}vst{% else %}rlog{% endif %} transformed feature counts +from DESeq2 with regard to the used {{snakemake.wildcards["antibody"]}} antibody. +There is more detailed `information on the count transformation in the docs `_. +For more information about DESeq2 in general, please also see the `documentation `_. From 992aa586d758df133c69423370556eae9ee66f80 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:22:01 +0100 Subject: [PATCH 34/40] Update workflow/report/plot_deseq2_sample_corr_heatmap.rst Co-authored-by: David Laehnemann --- .../report/plot_deseq2_sample_corr_heatmap.rst | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/workflow/report/plot_deseq2_sample_corr_heatmap.rst b/workflow/report/plot_deseq2_sample_corr_heatmap.rst index 1061e70..9d0c283 100644 --- a/workflow/report/plot_deseq2_sample_corr_heatmap.rst +++ b/workflow/report/plot_deseq2_sample_corr_heatmap.rst @@ -1,7 +1,8 @@ -Correlation `heatmap plots `_ shows heatmaps of the -`rlog `_ or -`vst `_ transformed counts from -the DESeq2 analysis. The results of this plot represent the comparison of the -{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups treated with the -{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please -see `documentation `_. +`Correlation heatmap `_ of the +{% if snakemake.params.vst == "true" %}vst{% else %}rlog{% endif %} transformed feature counts +from DESeq2, comparing the {{snakemake.wildcards["group_1"]}} group with +the {{snakemake.wildcards["group_2"]}} group +(both treated with the {{snakemake.wildcards["antibody"]}} antibody). +There is more detailed `information on the count transformation in the docs `_. +For more information about DESeq2 in general, please also see the +`documentation `_. From e6f8e4a55c3784772ef0291d0f70b11ae7ac3b1b Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:22:37 +0100 Subject: [PATCH 35/40] Update workflow/report/plot_deseq2_scatter.rst Co-authored-by: David Laehnemann --- workflow/report/plot_deseq2_scatter.rst | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/workflow/report/plot_deseq2_scatter.rst b/workflow/report/plot_deseq2_scatter.rst index 74c9f90..e244fef 100644 --- a/workflow/report/plot_deseq2_scatter.rst +++ b/workflow/report/plot_deseq2_scatter.rst @@ -1,8 +1,9 @@ -**Scatter plots** uses the matrix of the -`rlog `_ or -`vst `_ transformed counts from -the DESeq2 analysis. The results of this plot represent the comparison of the -{{snakemake.wildcards["group_1"]}} versus {{snakemake.wildcards["group_2"]}} groups treated with the -{{snakemake.wildcards["antibody"]}} antibody. For more information about DESeq2 please -see `documentation `_. +This **scatter plot** uses the matrix of the +{% if snakemake.params.vst == "true" %}vst{% else %}rlog{% endif %} transformed feature counts +from DESeq2, comparing the {{snakemake.wildcards["group_1"]}} group with +the {{snakemake.wildcards["group_2"]}} group +(both treated with the {{snakemake.wildcards["antibody"]}} antibody). +There is more detailed `information on the count transformation in the docs `_. +For more information about DESeq2 in general, please also see the +`documentation `_. From 131ef70c2136c3e802bbc642d4890cf7a7bfa5b1 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:23:24 +0100 Subject: [PATCH 36/40] Update workflow/report/plot_heatmap_deeptools.rst Co-authored-by: David Laehnemann --- workflow/report/plot_heatmap_deeptools.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/report/plot_heatmap_deeptools.rst b/workflow/report/plot_heatmap_deeptools.rst index 1c94b2a..b52cb52 100644 --- a/workflow/report/plot_heatmap_deeptools.rst +++ b/workflow/report/plot_heatmap_deeptools.rst @@ -1,2 +1,2 @@ -**deepTools** `plotHeatmap `_ creates a -heatmap of read coverage over sets of genomic regions and gives a visualisation for the genome-wide enrichment of the samples. +`**deepTools heatmap** `_ +of read coverage over sets of genomic regions, a visualisation for the genome-wide enrichment of the samples. From f8d1350afa05a4a6a44499a924102a68b1fd6785 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:23:42 +0100 Subject: [PATCH 37/40] Update workflow/report/plot_macs2_qc.rst Co-authored-by: David Laehnemann --- workflow/report/plot_macs2_qc.rst | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/workflow/report/plot_macs2_qc.rst b/workflow/report/plot_macs2_qc.rst index 73dba23..7ae5895 100644 --- a/workflow/report/plot_macs2_qc.rst +++ b/workflow/report/plot_macs2_qc.rst @@ -1,7 +1,9 @@ -`MACS2 `_ **callpeak quality control plots** show for all -samples and their associated controls the number of peaks and their distributions of peak length, the -`fold-change `_ of -the enrichment for their peak summit against random Poisson distribution, FDR and p-value from the results of the +These `MACS2 `_ +**callpeak quality control plots** show the following metrics for all samples and their associated controls: +the number of peaks and their distributions of peak length, +the `fold-change `_ +of the enrichment for their peak summit against random Poisson distribution, +FDR and p-value from the results of the `MACS callpeak analysis `_. -For more information about Model-based Analysis of ChIP-Seq (MACS) please see published -`article `_. +For more information about Model-based Analysis of ChIP-Seq (MACS) please see the +`MACS article `_. From 62ea61f812b90cd9c7db9b0b1bcb166f23b9809f Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Wed, 2 Feb 2022 22:25:10 +0100 Subject: [PATCH 38/40] Update workflow/report/plot_profile_deeptools.rst Co-authored-by: David Laehnemann --- workflow/report/plot_profile_deeptools.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/report/plot_profile_deeptools.rst b/workflow/report/plot_profile_deeptools.rst index a88643a..7bc5fa1 100644 --- a/workflow/report/plot_profile_deeptools.rst +++ b/workflow/report/plot_profile_deeptools.rst @@ -1,3 +1,3 @@ -**deepTools** `plotProfile `_ creates a -profile plot of read coverage over sets of genomic regions and gives a visualisation for the genome-wide enrichment of the -samples. +**deepTools** `profile plots `_ +show the read coverage over sets of genomic regions and give a visualisation for the genome-wide +enrichment of the samples. From 9d5661819da9d2e28e856e95243cfe4873ce3bc8 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Fri, 4 Feb 2022 00:47:36 +0100 Subject: [PATCH 39/40] Changes according PR #8 --- workflow/rules/consensus_peak_analysis.smk | 4 ---- 1 file changed, 4 deletions(-) diff --git a/workflow/rules/consensus_peak_analysis.smk b/workflow/rules/consensus_peak_analysis.smk index 1d76d32..e1de340 100644 --- a/workflow/rules/consensus_peak_analysis.smk +++ b/workflow/rules/consensus_peak_analysis.smk @@ -1,7 +1,3 @@ -import os - -from snakemake import rules - rule bedtools_merge_broad: input: get_macs2_peaks() From e80da69bf54ce9f85459979062a382052ed95f60 Mon Sep 17 00:00:00 2001 From: Antonie Vietor <45459714+AntonieV@users.noreply.github.com> Date: Fri, 4 Feb 2022 00:49:49 +0100 Subject: [PATCH 40/40] Changes according PR #8 --- workflow/rules/post-analysis.smk | 1 - 1 file changed, 1 deletion(-) diff --git a/workflow/rules/post-analysis.smk b/workflow/rules/post-analysis.smk index 673f317..ff5d6cb 100644 --- a/workflow/rules/post-analysis.smk +++ b/workflow/rules/post-analysis.smk @@ -112,7 +112,6 @@ rule bedGraphToBigWig: rule create_igv_bigwig: input: - genome="resources/ref/genome.bed", bigwig=expand("results/big_wig/{sample}.bigWig", sample=samples.index) output: "results/IGV/big_wig/merged_library.bigWig.igv.txt"