From 62819e1e73657bbbd6c745f4c33418930b642d30 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Wed, 2 Apr 2025 21:11:39 +0200 Subject: [PATCH 01/12] re-organize with modules / remove workflow and call properly twice the same process / improve reproducibility by using software version configuration / set profile to use either singularity/docker/conda / re-work on README / add test profile / add local and hpc ressource config file --- MobiCT.nf | 495 +---------------------- README.md | 130 +++++- config/ressources/conda/bcftools.yaml | 7 + config/ressources/conda/bwa.yaml | 7 + config/ressources/conda/ensembl-vep.yaml | 7 + config/ressources/conda/fastp.yaml | 7 + config/ressources/conda/fgbio.yaml | 7 + config/ressources/conda/gatk4.yaml | 7 + config/ressources/conda/multiqc.yaml | 7 + config/ressources/conda/picard.yaml | 7 + config/ressources/conda/samtools.yaml | 7 + config/ressources/conda/vardict.yaml | 7 + config/ressources/hpc.config | 92 +++++ config/ressources/local.config | 48 +++ config/softwares.config | 52 +++ modules/bcftools.nf | 17 + modules/bwamem.nf | 24 ++ modules/fastp.nf | 29 ++ modules/fgbio.nf | 76 ++++ modules/gatk.nf | 133 ++++++ modules/multiqc.nf | 34 ++ modules/picard.nf | 43 ++ modules/samtools.nf | 19 + modules/vardict.nf | 26 ++ modules/vep.nf | 32 ++ nextflow.config | 92 ++++- 26 files changed, 914 insertions(+), 498 deletions(-) create mode 100644 config/ressources/conda/bcftools.yaml create mode 100644 config/ressources/conda/bwa.yaml create mode 100644 config/ressources/conda/ensembl-vep.yaml create mode 100644 config/ressources/conda/fastp.yaml create mode 100644 config/ressources/conda/fgbio.yaml create mode 100644 config/ressources/conda/gatk4.yaml create mode 100644 config/ressources/conda/multiqc.yaml create mode 100644 config/ressources/conda/picard.yaml create mode 100644 config/ressources/conda/samtools.yaml create mode 100644 config/ressources/conda/vardict.yaml create mode 100644 config/ressources/hpc.config create 
mode 100644 config/ressources/local.config create mode 100644 config/softwares.config create mode 100644 modules/bcftools.nf create mode 100644 modules/bwamem.nf create mode 100644 modules/fastp.nf create mode 100644 modules/fgbio.nf create mode 100644 modules/gatk.nf create mode 100644 modules/multiqc.nf create mode 100644 modules/picard.nf create mode 100644 modules/samtools.nf create mode 100644 modules/vardict.nf create mode 100644 modules/vep.nf diff --git a/MobiCT.nf b/MobiCT.nf index 338c373..259a139 100644 --- a/MobiCT.nf +++ b/MobiCT.nf @@ -23,479 +23,20 @@ email = 's-cabelloaguilar@chu-montpellier.fr' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// PROCESSES - -// Convert the demultiplexed, raw sequencing FASTQ files to BAM -process ConvertFastqToSam { - tag "$sample_id" - - input: - tuple val('sample_id'), path(fastq) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam") - - """ - gatk FastqToSam \ - --FASTQ ${fastq[0]} \ - --FASTQ2 ${fastq[1]} \ - --OUTPUT ${sample_id}${extension}.bam \ - --SAMPLE_NAME ${sample_id} \ - --TMP_DIR ${params.tmp_dir} - """ -} - -// Extraction of UMIs from the insert reads -process ExtractUmis { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam_file) - val struct_r1 - val struct_r2 - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam") - - """ - fgbio ExtractUmisFromBam \ - -i ${bam_file} \ - -o ${sample_id}${extension}.bam \ - -r ${struct_r1} ${struct_r2} \ - -t RX \ - -a true - """ -} - -// Convert the BAM file with UMI extracted reads to a FASTQ file -process ConvertSamToFastq { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam_file) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq") - - """ - gatk SamToFastq \ - -I ${bam_file} \ - -F ${sample_id}${extension}.R1.fq \ - -F2 ${sample_id}${extension}.R2.fq \ - --CLIPPING_ATTRIBUTE XT \ - 
--CLIPPING_ACTION 2 \ - --MAX_RECORDS_IN_RAM 50000000 - """ -} - -// Adapter and quality trimming -process Fastp { - tag "$sample_id" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.{json,html}' - - input: - tuple val(sample_id), path(fastq) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq") - file "${sample_id}.QC.fastp.json" - file "${sample_id}.QC.fastp.html" - - script: - """ - fastp \ - -i ${fastq[0]} \ - -o ${sample_id}${extension}.R1.fq \ - -I ${fastq[1]} \ - -O ${sample_id}${extension}.R2.fq \ - -g -W 5 -q 20 -u 40 -x -3 -l 75 -c \ - -j ${sample_id}.QC.fastp.json \ - -h ${sample_id}.QC.fastp.html \ - -w 12 - """ -} - -// Alignement before deduplication: Align quality and adapter trimmed reads to -// the reference genome -process BWAmem { - tag "$sample_id" - clusterOptions '-n 10' - - input: - tuple val(sample_id), path(fastq) - val opt_bwa - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam") - - """ - bwa mem \ - ${opt_bwa} \ - -M ${params.ref} \ - ${fastq[0]} \ - ${fastq[1]} \ - | \ - samtools view -bh -o ${sample_id}${extension}.bam - """ -} - -// Merge the two BAM files containing: -// 1: the UMI information: output of ExtractUmis process -// 2: the alignment coordinate information: output of bwaMEM process -process MergeBam { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam_aligned), path(bam_unmapped) - val extension - - - output: - tuple val(sample_id), file("${sample_id}${extension}.ba[i,m]") - - """ - gatk MergeBamAlignment \ - --ATTRIBUTES_TO_RETAIN X0 \ - --ATTRIBUTES_TO_REMOVE NM \ - --ATTRIBUTES_TO_REMOVE MD \ - --ALIGNED_BAM ${bam_aligned} \ - --UNMAPPED_BAM ${bam_unmapped} \ - --OUTPUT ${sample_id}${extension}.bam \ - --REFERENCE_SEQUENCE ${params.ref} \ - --SORT_ORDER 'queryname' \ - --ALIGNED_READS_ONLY true \ - --MAX_INSERTIONS_OR_DELETIONS -1 \ - --PRIMARY_ALIGNMENT_STRATEGY MostDistant \ - 
--ALIGNER_PROPER_PAIR_FLAGS true \ - --CREATE_INDEX true \ - --CLIP_OVERLAPPING_READS false - """ -} - -// -process UmiMergeFilt { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam") - - """ - samtools view \ - -f2 \ - -bh ${bam} \ - > ${sample_id}${extension}.bam - """ -} - -// Identify and group reads originating from the same source molecule -// The user can control how many errors/mismatches are allowed in the UMI sequence when assigning source molecules (--edits=n). -process GroupReads { - tag "$sample_id" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.txt' - - input: - tuple val(sample_id), path(bam) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam"), emit: nextout - file "${sample_id}.QC.family_size_counts.txt" - - """ - fgbio GroupReadsByUmi \ - -i ${bam} \ - -o ${sample_id}${extension}.bam \ - --strategy=adjacency \ - --edits=1 \ - -t RX \ - -f ${sample_id}.QC.family_size_counts.txt - """ -} - -// Generate consensus reads -// Calculate the consensus sequence, Reads that occur as singletons are -// discarded by default but this can be changed by setting the –min-reads flag -// to 1, in so doing the single read will be considered the consensus. 
-process CallConsensus { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam") - - """ - fgbio CallMolecularConsensusReads \ - -i ${bam} \ - -o ${sample_id}${extension}.bam \ - --error-rate-post-umi 40 \ - --error-rate-pre-umi 45 \ - --output-per-base-tags false \ - --min-reads 2 \ - --max-reads 50 \ - --read-name-prefix='consensus' - """ -} - -workflow RerunConvertSamToFastq { - take: - tuple_input - extension - main: - ConvertSamToFastq(tuple_input, extension) - emit: - final_out = ConvertSamToFastq.out -} - -workflow RerunBWAmem { - take: - tuple_input - opt_bwa - extension - main: - BWAmem(tuple_input, opt_bwa, extension) - emit: - final_out = BWAmem.out -} - -// Sort the consensus_mapped.bam with the consensus_unmapped.bam to prepare them as input for the next step -process SortConsensus { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam) - val extension - - output: - tuple val(sample_id), path("${sample_id}${extension}.sort.bam") - - script: - - """ - gatk SortSam \ - -I ${bam} \ - --OUTPUT ${sample_id}${extension}.sort.bam \ - --SORT_ORDER queryname - """ -} - -workflow RerunSortConsensus { - take: - tuple_input - extension - main: - SortConsensus(tuple_input, extension) - emit: - final_out = SortConsensus.out -} - -// Finally, merge the consensus_mapped.bam with the consensus_unmapped.bam to -// retain the UMI group information. 
-process MergeBam2 { - tag "$sample_id" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), path(bam_aligned), path(bam_unmapped) - val extension - - output: - tuple val(sample_id), path("${sample_id}${extension}.ba[m,i]") - - """ - gatk MergeBamAlignment \ - --ATTRIBUTES_TO_RETAIN X0 \ - --ATTRIBUTES_TO_RETAIN RX \ - --ALIGNED_BAM ${bam_aligned} \ - --UNMAPPED_BAM ${bam_unmapped} \ - --OUTPUT ${sample_id}${extension}.bam \ - --REFERENCE_SEQUENCE ${params.ref} \ - --SORT_ORDER coordinate \ - --ADD_MATE_CIGAR true \ - --MAX_INSERTIONS_OR_DELETIONS -1 \ - --PRIMARY_ALIGNMENT_STRATEGY MostDistant \ - --ALIGNER_PROPER_PAIR_FLAGS true \ - --CREATE_INDEX true \ - --CLIP_OVERLAPPING_READS false - """ -} - -// Variant calling step using vardict -process VarDict { - tag "$sample_id" - - input: - tuple val(sample_id), path(bami) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.vcf") - - """ - vardict \ - -G ${params.ref} \ - -f 0.0005 \ - -N ${sample_id} \ - -b ${bami[1]} \ - -c 1 \ - -S 2 \ - -E 3 \ - -g 4 ${params.bed} \ - | $params.teststrandbias \ - | $params.var2vcf > ${sample_id}${extension}.vcf - """ -} - -// Annotation step using VEP -process AnnotationVEP { - tag "$sample_id" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), path(vcf) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.vcf") - - """ - vep \ - -i ${vcf} \ - -o ${sample_id}${extension}.vcf \ - --cache \ - --dir_cache ${params.cache} \ - --offline \ - --force_overwrite \ - --vcf \ - --numbers \ - --refseq \ - --symbol \ - --hgvs \ - --canonical \ - --max_af \ - --fasta ${params.fasta} - """ -} - -process BedToIntervalList { - - input: - path dict - path bed - val extension - - output: - file "${extension}.interval_list" - - """ - picard BedToIntervalList \ - --SEQUENCE_DICTIONARY ${dict} \ - --INPUT ${bed} \ - 
--OUTPUT ${extension}.interval_list - """ -} - -// Extraction of read quality metrics before deduplication -process CollectHsMetrics { - tag "$sample_id" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), file(bam) - file bed - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.txt") - - """ - picard CollectHsMetrics \ - --REFERENCE_SEQUENCE ${params.ref} \ - --BAIT_INTERVALS ${bed} \ - --TARGET_INTERVALS ${bed} \ - --INPUT ${bam} \ - --OUTPUT ${sample_id}${extension}.txt - """ -} - -workflow RerunCollectHsMetrics { - take: - tuple_input - bed - extension - main: - CollectHsMetrics(tuple_input, bed, extension) - emit: - final_out = CollectHsMetrics.out -} - -process BCFtools_stats { - tag "${sample_id}" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), path(file) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.stats") - - """ - bcftools stats ${file} > ${sample_id}${extension}.stats - """ -} - -// Generate a multi-quality control report from collected metrics data (process -// collectmetrics2 output). 
-process MultiQC { - tag "${sample_id}" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), path(file) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}") - - """ - multiqc ${params.outdir}/${sample_id} -o ${sample_id}${extension} - """ -} - -process MultiQC_ALL { - publishDir "${params.outdir}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), path(file) - val extension - - output: - file("${extension}") - - """ - multiqc ${params.outdir} -o ${extension} - """ -} - +// Include modules +include {ConvertFastqToSam; MergeBam; MergeBam2; SortConsensus; SortConsensus as RerunSortConsensus; ConvertSamToFastq; ConvertSamToFastq as RerunConvertSamToFastq } from "$baseDir/modules/gatk.nf" +include {BedToIntervalList; CollectHsMetrics; CollectHsMetrics as RerunCollectHsMetrics;} from "$baseDir/modules/picard.nf" +include {BWAmem; BWAmem as RerunBWAmem;} from "$baseDir/modules/bwamem.nf" +include {BCFtools_stats} from "$baseDir/modules/bcftools.nf" +include {Fastp} from "$baseDir/modules/fastp.nf" +include {ExtractUmis; GroupReads; CallConsensus; } from "$baseDir/modules/fgbio.nf" +include {MultiQC; MultiQC_ALL; } from "$baseDir/modules/multiqc.nf" +include {UmiMergeFilt} from "$baseDir/modules/samtools.nf" +include {VarDict} from "$baseDir/modules/vardict.nf" +include {AnnotationVEP} from "$baseDir/modules/vep.nf" + + +// Define the workflow workflow { Channel.fromFilePairs(params.fastq, checkIfExists:true) .filter{ v -> v=~ params.filter_fastq} @@ -518,11 +59,11 @@ workflow { // 3. 
Post process deduplication RerunConvertSamToFastq(CallConsensus.out, ".3.unmapped") - RerunBWAmem(RerunConvertSamToFastq.out.final_out, "-t 10 -Y", ".3.consensus_mapped") + RerunBWAmem(RerunConvertSamToFastq.out, "-t 10 -Y", ".3.consensus_mapped") SortConsensus(CallConsensus.out, ".3.unmapped") - RerunSortConsensus(RerunBWAmem.out.final_out, ".3.mapped") + RerunSortConsensus(RerunBWAmem.out, ".3.mapped") - RerunSortConsensus.out.final_out.join(SortConsensus.out).set{bams_consensus} + RerunSortConsensus.out.join(SortConsensus.out).set{bams_consensus} MergeBam2(bams_consensus, ".3.merged") // 4. Variant Calling & Annotation @@ -532,7 +73,7 @@ workflow { // Quality Controls BedToIntervalList(params.dict, params.bed, params.bed.tokenize('.')[0].tokenize('/')[-1]) CollectHsMetrics(BWAmem.out, BedToIntervalList.out, ".QC.HsMetrics.1") - RerunCollectHsMetrics(RerunBWAmem.out.final_out, BedToIntervalList.out, ".QC.HsMetrics.3") + RerunCollectHsMetrics(RerunBWAmem.out, BedToIntervalList.out, ".QC.HsMetrics.3") BCFtools_stats(VarDict.out, ".QC.bcftools_stats") MultiQC(BCFtools_stats.out, ".QC.multiQC") diff --git a/README.md b/README.md index 74e72ff..d08d2e0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,22 @@ # MobiCT ctDNA Analysis pipeline. Version 1.0.0 -# Introduction +## Table of Contents + +* [Introduction](#introduction) +* [Quick Start](#quick-start) +* [Installation](#installation) + * [Nextflow](#nextflow) + * [Execution environment](#execution-environment) + * [Container platform](#container-platform) + * [Docker](#docker) + * [Singularity](#singularity) + * [Conda](#conda) +* [Usage](#usage) +* [Pipeline output](#pipeline-output) +* [Data availability](#data-availability) + +## Introduction Mobict is an analysis pipeline designed for detecting SNV and small InDels in circulating tumor DNA (ctDNA) obtained through non-invasive liquid biopsy, serving diagnostic, prognostic, and therapeutic purposes. The pipeline is built using Nextflow. 
The associated publication is available here: /doi/.../ @@ -20,13 +35,118 @@ The associated publication is available here: /doi/.../ 4. Download the reference genome 5. Download the datasets needed by VEP (see https://github.com/Ensembl/ensembl-vep) 6. Edit the *.config* file with input and output files/paths -7. Activate your conda environment -8. Run MobiCT on your Dataset +7. Run MobiCT on your Dataset `Nextflow -log /output_directory/my.log run MobiCT.nf -c nextflow.config` -# Pipeline output +## Installation + +The prerequisites to run the pipeline are: + + * [Nextflow](https://www.nextflow.io/) >= 22.04.0 + * [Docker](https://www.docker.com) or [Singularity](https://sylabs.io/singularity/) or [conda](https://juke34.github.io/fix-anaconda-licensing-issues/en/pages/conda-distrib/) + +### Nextflow + + * Via conda + +
+ See here + + ```bash + conda create -n nextflow + conda activate nextflow + conda install bioconda::nextflow + ``` +
+ + * Manually +
+ See here + Nextflow runs on most POSIX systems (Linux, macOS, etc) and can typically be installed by running these commands: + + ```bash + # Make sure 11 or later is installed on your computer by using the command: + java -version + + # Install Nextflow by entering this command in your terminal(it creates a file nextflow in the current dir): + curl -s https://get.nextflow.io | bash + + # Add Nextflow binary to your user's PATH: + mv nextflow ~/bin/ + # OR system-wide installation: + # sudo mv nextflow /usr/local/bin + ``` +
+ +### Execution environment + +#### Container platform + +To run the workflow you will need a container platform: docker or singularity. + +##### Docker + +Please follow the instructions at the [Docker website](https://docs.docker.com/desktop/) + +##### Singularity + +Please follow the instructions at the [Singularity website](https://docs.sylabs.io/guides/latest/admin-guide/installation.html) + +#### Conda + +It is recommended to use a container platform. However, if you prefer to use Conda, please follow the instructions available [here](https://juke34.github.io/fix-anaconda-licensing-issues/en/pages/conda-installation/) + +## Usage + +### Profile + +To run the workflow you must select a profile according to the container platform you want to use: +- `singularity`, a profile using Singularity containers to run the softwares +- `docker`, a profile using Docker containers to run the softwares +- `conda`, a profile using conda environments to run the softwares + + +### From Release + +#### Default usage +The command will look like that: + +```bash +nextflow run SimCab-CHU/MobiCT -r v1.0.0 -profile docker +``` + +#### Test + +```bash +nextflow run SimCab-CHU/MobiCT -r v1.0.0 -profile docker,test +``` + +### From repo + +```bash +# clone the workflow repository +git clone https://github.com/SimCab-CHU/MobiCT.git + +# Move in it +cd MobiCT +``` +#### Default usage + +```bash +# Run MobiCT +nextflow run MobiCT.nf -profile docker +``` + +#### Test + +```bash +# Run MobiCT +nextflow run MobiCT.nf -profile docker,test +``` + +## Pipeline output The output files are stored in the directory you specified using the **outdir** parameter in the .config file. The **outdir** contains a folder per sample plus a **multiQC** folder. Each sample folder contains a deduplicated and aligned *.bam* file and its index, an annotated *.vcf* file, 3 metrics files (*.HsMetrics.1.txt*, *.HsMetrics.3.txt* and *QC.bcftools_stats.stats*). 
-# Data availability +## Data availability Raw sequencing data (FASTQ files) of commercial controls used in the study are available @ https://www.ncbi.nlm.nih.gov/sra/PRJNA1209006. diff --git a/config/ressources/conda/bcftools.yaml b/config/ressources/conda/bcftools.yaml new file mode 100644 index 0000000..fc17e15 --- /dev/null +++ b/config/ressources/conda/bcftools.yaml @@ -0,0 +1,7 @@ +name: bcftools +channels: + - conda-forge + - bioconda + +dependencies: + - bcftools=1.9 \ No newline at end of file diff --git a/config/ressources/conda/bwa.yaml b/config/ressources/conda/bwa.yaml new file mode 100644 index 0000000..7f710ae --- /dev/null +++ b/config/ressources/conda/bwa.yaml @@ -0,0 +1,7 @@ +name: bwa +channels: + - conda-forge + - bioconda + +dependencies: + - bwa=0.7.19 \ No newline at end of file diff --git a/config/ressources/conda/ensembl-vep.yaml b/config/ressources/conda/ensembl-vep.yaml new file mode 100644 index 0000000..a9ee1c5 --- /dev/null +++ b/config/ressources/conda/ensembl-vep.yaml @@ -0,0 +1,7 @@ +name: ensembl-vep +channels: + - conda-forge + - bioconda + +dependencies: + - ensembl-vep=113.0 \ No newline at end of file diff --git a/config/ressources/conda/fastp.yaml b/config/ressources/conda/fastp.yaml new file mode 100644 index 0000000..8672409 --- /dev/null +++ b/config/ressources/conda/fastp.yaml @@ -0,0 +1,7 @@ +name: fastp +channels: + - conda-forge + - bioconda + +dependencies: + - fastp=0.24.0 \ No newline at end of file diff --git a/config/ressources/conda/fgbio.yaml b/config/ressources/conda/fgbio.yaml new file mode 100644 index 0000000..348db93 --- /dev/null +++ b/config/ressources/conda/fgbio.yaml @@ -0,0 +1,7 @@ +name: fgbio +channels: + - conda-forge + - bioconda + +dependencies: + - fgbio=2.5.0 \ No newline at end of file diff --git a/config/ressources/conda/gatk4.yaml b/config/ressources/conda/gatk4.yaml new file mode 100644 index 0000000..079a8dc --- /dev/null +++ b/config/ressources/conda/gatk4.yaml @@ -0,0 +1,7 @@ +name: gatk4 
+channels: + - conda-forge + - bioconda + +dependencies: + - gatk4=4.6.1.0 \ No newline at end of file diff --git a/config/ressources/conda/multiqc.yaml b/config/ressources/conda/multiqc.yaml new file mode 100644 index 0000000..f630102 --- /dev/null +++ b/config/ressources/conda/multiqc.yaml @@ -0,0 +1,7 @@ +name: multiqc +channels: + - conda-forge + - bioconda + +dependencies: + - multiqc=1.28 \ No newline at end of file diff --git a/config/ressources/conda/picard.yaml b/config/ressources/conda/picard.yaml new file mode 100644 index 0000000..61da0f6 --- /dev/null +++ b/config/ressources/conda/picard.yaml @@ -0,0 +1,7 @@ +name: picard +channels: + - conda-forge + - bioconda + +dependencies: + - picard=3.3.0 \ No newline at end of file diff --git a/config/ressources/conda/samtools.yaml b/config/ressources/conda/samtools.yaml new file mode 100644 index 0000000..9beff77 --- /dev/null +++ b/config/ressources/conda/samtools.yaml @@ -0,0 +1,7 @@ +name: samtools +channels: + - conda-forge + - bioconda + +dependencies: + - samtools=1.6 \ No newline at end of file diff --git a/config/ressources/conda/vardict.yaml b/config/ressources/conda/vardict.yaml new file mode 100644 index 0000000..9af8e25 --- /dev/null +++ b/config/ressources/conda/vardict.yaml @@ -0,0 +1,7 @@ +name: vardict +channels: + - conda-forge + - bioconda + +dependencies: + - vardict=2018.10.18 \ No newline at end of file diff --git a/config/ressources/hpc.config b/config/ressources/hpc.config new file mode 100644 index 0000000..9156f0c --- /dev/null +++ b/config/ressources/hpc.config @@ -0,0 +1,92 @@ +process { + cpus = 1 + time = '1h' + maxForks = 20 + shell = ['/bin/bash', '-euo', 'pipefail'] + stageOutMode = 'rsync' + + withLabel: 'bcftools' { + cpus = 1 + time = '1h' + } + withLabel: 'bwamem' { + cpus = 10 + time = '24h' + } + withLabel: 'bowtie' { + cpus = 16 + time = '4h' + } + withLabel: 'bowtie2' { + cpus = 16 + time = '4h' + } + withLabel: 'bwa' { + cpus = 16 + time = '4h' + } + withLabel: 'fastp' { + cpus = 
16 + time = '2h' + } + withLabel: 'fastqc' { + cpus = 8 + time = '1h' + } + withLabel: 'hisat2' { + cpus = 16 + time = '4h' + } + withLabel: 'kallisto' { + cpus = 16 + time = '4h' + } + withLabel: 'graphmap2' { + cpus = 16 + time = '4h' + } + withLabel: 'minimap2' { + cpus = 16 + time = '4h' + } + withLabel: 'multiqc' { + cpus = 8 + time = '1h' + } + withLabel: 'mummer4' { + cpus = 16 + time = '4h' + } + withLabel: 'ngmlr' { + cpus = 16 + time = '1h' + } + withLabel: 'novoalign' { + cpus = 16 + time = '4h' + } + withLabel: 'salmon' { + cpus = 16 + time = '4h' + } + withLabel: 'samtools' { + cpus = 16 + time = '2h' + } + withLabel: 'seqkit' { + cpus = 8 + time = '4h' + } + withLabel: 'seqtk' { + cpus = 16 + time = '4h' + } + withLabel: 'star' { + cpus = 16 + time = '4h' + } + withLabel: 'subread' { + cpus = 16 + time = '4h' + } +} diff --git a/config/ressources/local.config b/config/ressources/local.config new file mode 100644 index 0000000..9774c2c --- /dev/null +++ b/config/ressources/local.config @@ -0,0 +1,48 @@ +process { + cpus = 1 + time = '1h' + maxForks = 8 + shell = ['/bin/bash', '-euo', 'pipefail'] + stageOutMode = 'rsync' + + withLabel: 'bcftools' { + cpus = 4 + time = '1h' + } + withLabel: 'bwamem' { + cpus = 4 + time = '4h' + } + withLabel: 'fastp' { + cpus = 4 + time = '4h' + } + withLabel: 'fgbio' { + cpus = 4 + time = '4h' + } + withLabel: 'gatk4' { + cpus = 4 + time = '4h' + } + withLabel: 'multiqc' { + cpus = 4 + time = '2h' + } + withLabel: 'picard' { + cpus = 4 + time = '1h' + } + withLabel: 'samtools' { + cpus = 4 + time = '4h' + } + withLabel: 'vardict' { + cpus = 2 + time = '4h' + } + withLabel: 'vep' { + cpus = 4 + time = '4h' + } +} diff --git a/config/softwares.config b/config/softwares.config new file mode 100644 index 0000000..a3564c8 --- /dev/null +++ b/config/softwares.config @@ -0,0 +1,52 @@ +process { + withLabel: 'bcftools' { + conda = { singularity.enabled || docker.enabled ? 
'' : "$baseDir/config/ressources/conda/bcftools.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/bcftools:1.9--h4da6232_0' : + docker.enabled ? 'quay.io/biocontainers/bcftools:1.9--h4da6232_0' : '' } + } + withLabel: 'bwamem' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/bwa.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/bwa:0.7.19--h577a1d6_0' : + docker.enabled ? 'quay.io/biocontainers/bwa:0.7.19--h577a1d6_0' : '' } + } + withLabel: 'fastp' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/fastp.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/fastp:0.24.0--heae3180_1' : + docker.enabled ? 'quay.io/biocontainers/fastp:0.24.0--heae3180_1' : '' } + } + withLabel: 'fgbio' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/fgbio.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/fgbio:2.5.0--hdfd78af_1' : + docker.enabled ? 'quay.io/biocontainers/fgbio:2.5.0--hdfd78af_1' : '' } + } + withLabel: 'gatk4' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/gatk4.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/gatk4:4.6.1.0--py310hdfd78af_0' : + docker.enabled ? 'quay.io/biocontainers/gatk4:4.6.1.0--py310hdfd78af_0' : '' } + } + withLabel: 'multiqc' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/multiqc.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/multiqc:1.28--pyhdfd78af_0' : + docker.enabled ? 'quay.io/biocontainers/multiqc:1.28--pyhdfd78af_0' : '' } + } + withLabel: 'picard' { + conda = { singularity.enabled || docker.enabled ? 
'' : "$baseDir/config/ressources/conda/picard.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/picard:3.3.0--hdfd78af_0' : + docker.enabled ? 'quay.io/biocontainers/picard:3.3.0--hdfd78af_0' : '' } + } + withLabel: 'samtools' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/samtools.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/samtools:1.6--h5fe306e_11' : + docker.enabled ? 'quay.io/biocontainers/samtools:1.6--h5fe306e_11' : '' } + } + withLabel: 'vardict' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/vardict.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/vardict:2018.10.18--pl526_1' : + docker.enabled ? 'quay.io/biocontainers/vardict:2018.10.18--pl526_1' : '' } + } + withLabel: 'vep' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/ensembl-vep.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/ensembl-vep:113.0--pl5321h2a3209d_0' : + docker.enabled ? 
'quay.io/biocontainers/ensembl-vep:113.0--pl5321h2a3209d_0' : '' } + } +} \ No newline at end of file diff --git a/modules/bcftools.nf b/modules/bcftools.nf new file mode 100644 index 0000000..961c5c2 --- /dev/null +++ b/modules/bcftools.nf @@ -0,0 +1,17 @@ +process BCFtools_stats { + tag "${sample_id}" + label 'bcftools' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), path(file) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.stats") + + """ + bcftools stats ${file} > ${sample_id}${extension}.stats + """ +} diff --git a/modules/bwamem.nf b/modules/bwamem.nf new file mode 100644 index 0000000..6a46753 --- /dev/null +++ b/modules/bwamem.nf @@ -0,0 +1,24 @@ +// Alignement before deduplication: Align quality and adapter trimmed reads to +// the reference genome +process BWAmem { + tag "$sample_id" + label 'bwamem' + + input: + tuple val(sample_id), path(fastq) + val opt_bwa + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam") + + """ + bwa mem \ + ${opt_bwa} \ + -M ${params.ref} \ + ${fastq[0]} \ + ${fastq[1]} \ + | \ + samtools view -bh -o ${sample_id}${extension}.bam + """ +} \ No newline at end of file diff --git a/modules/fastp.nf b/modules/fastp.nf new file mode 100644 index 0000000..2856a8e --- /dev/null +++ b/modules/fastp.nf @@ -0,0 +1,29 @@ +// Adapter and quality trimming +process Fastp { + tag "$sample_id" + label 'fastp' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.{json,html}' + + input: + tuple val(sample_id), path(fastq) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq") + file "${sample_id}.QC.fastp.json" + file "${sample_id}.QC.fastp.html" + + script: + """ + fastp \ + -i ${fastq[0]} \ + -o ${sample_id}${extension}.R1.fq \ + -I ${fastq[1]} \ + -O ${sample_id}${extension}.R2.fq \ + -g -W 5 -q 20 -u 40 -x -3 -l 75 -c \ + -j 
${sample_id}.QC.fastp.json \ + -h ${sample_id}.QC.fastp.html \ + -w 12 + """ +} \ No newline at end of file diff --git a/modules/fgbio.nf b/modules/fgbio.nf new file mode 100644 index 0000000..cd9b802 --- /dev/null +++ b/modules/fgbio.nf @@ -0,0 +1,76 @@ +// Extraction of UMIs from the insert reads +process ExtractUmis { + tag "$sample_id" + label 'fgbio' + + input: + tuple val(sample_id), path(bam_file) + val struct_r1 + val struct_r2 + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam") + + """ + fgbio ExtractUmisFromBam \ + -i ${bam_file} \ + -o ${sample_id}${extension}.bam \ + -r ${struct_r1} ${struct_r2} \ + -t RX \ + -a true + """ +} + +// Identify and group reads originating from the same source molecule +// The user can control how many errors/mismatches are allowed in the UMI sequence when assigning source molecules (--edits=n). +process GroupReads { + tag "$sample_id" + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.txt' + + input: + tuple val(sample_id), path(bam) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam"), emit: nextout + file "${sample_id}.QC.family_size_counts.txt" + + """ + fgbio GroupReadsByUmi \ + -i ${bam} \ + -o ${sample_id}${extension}.bam \ + --strategy=adjacency \ + --edits=1 \ + -t RX \ + -f ${sample_id}.QC.family_size_counts.txt + """ +} + +// Generate consensus reads +// Calculate the consensus sequence, Reads that occur as singletons are +// discarded by default but this can be changed by setting the –min-reads flag +// to 1, in so doing the single read will be considered the consensus. 
+process CallConsensus { + tag "$sample_id" + + input: + tuple val(sample_id), path(bam) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam") + + """ + fgbio CallMolecularConsensusReads \ + -i ${bam} \ + -o ${sample_id}${extension}.bam \ + --error-rate-post-umi 40 \ + --error-rate-pre-umi 45 \ + --output-per-base-tags false \ + --min-reads 2 \ + --max-reads 50 \ + --read-name-prefix='consensus' + """ +} \ No newline at end of file diff --git a/modules/gatk.nf b/modules/gatk.nf new file mode 100644 index 0000000..8050325 --- /dev/null +++ b/modules/gatk.nf @@ -0,0 +1,133 @@ +// Convert the demultiplexed, raw sequencing FASTQ files to BAM +process ConvertFastqToSam { + tag "$sample_id" + label 'gatk4' + + input: + tuple val('sample_id'), path(fastq) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam") + + """ + gatk FastqToSam \ + --FASTQ ${fastq[0]} \ + --FASTQ2 ${fastq[1]} \ + --OUTPUT ${sample_id}${extension}.bam \ + --SAMPLE_NAME ${sample_id} \ + --TMP_DIR ${params.tmp_dir} + """ +} + +// Convert the BAM file with UMI extracted reads to a FASTQ file +process ConvertSamToFastq { + tag "$sample_id" + label 'gatk4' + + input: + tuple val(sample_id), path(bam_file) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq") + + """ + gatk SamToFastq \ + -I ${bam_file} \ + -F ${sample_id}${extension}.R1.fq \ + -F2 ${sample_id}${extension}.R2.fq \ + --CLIPPING_ATTRIBUTE XT \ + --CLIPPING_ACTION 2 \ + --MAX_RECORDS_IN_RAM 50000000 + """ +} + +// Merge the two BAM files containing: +// 1: the UMI information: output of ExtractUmis process +// 2: the alignment coordinate information: output of bwaMEM process +process MergeBam { + tag "$sample_id" + label 'gatk4' + + input: + tuple val(sample_id), path(bam_aligned), path(bam_unmapped) + val extension + + + output: + tuple val(sample_id), file("${sample_id}${extension}.ba[i,m]") + + """ + gatk MergeBamAlignment \ + 
--ATTRIBUTES_TO_RETAIN X0 \ + --ATTRIBUTES_TO_REMOVE NM \ + --ATTRIBUTES_TO_REMOVE MD \ + --ALIGNED_BAM ${bam_aligned} \ + --UNMAPPED_BAM ${bam_unmapped} \ + --OUTPUT ${sample_id}${extension}.bam \ + --REFERENCE_SEQUENCE ${params.ref} \ + --SORT_ORDER 'queryname' \ + --ALIGNED_READS_ONLY true \ + --MAX_INSERTIONS_OR_DELETIONS -1 \ + --PRIMARY_ALIGNMENT_STRATEGY MostDistant \ + --ALIGNER_PROPER_PAIR_FLAGS true \ + --CREATE_INDEX true \ + --CLIP_OVERLAPPING_READS false + """ +} + +// Sort the consensus_mapped.bam with the consensus_unmapped.bam to prepare them as input for the next step +process SortConsensus { + tag "$sample_id" + label 'gatk4' + + input: + tuple val(sample_id), path(bam) + val extension + + output: + tuple val(sample_id), path("${sample_id}${extension}.sort.bam") + + script: + + """ + gatk SortSam \ + -I ${bam} \ + --OUTPUT ${sample_id}${extension}.sort.bam \ + --SORT_ORDER queryname + """ +} + +// Finally, merge the consensus_mapped.bam with the consensus_unmapped.bam to +// retain the UMI group information. 
+process MergeBam2 { + tag "$sample_id" + label 'gatk4' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), path(bam_aligned), path(bam_unmapped) + val extension + + output: + tuple val(sample_id), path("${sample_id}${extension}.ba[m,i]") + + """ + gatk MergeBamAlignment \ + --ATTRIBUTES_TO_RETAIN X0 \ + --ATTRIBUTES_TO_RETAIN RX \ + --ALIGNED_BAM ${bam_aligned} \ + --UNMAPPED_BAM ${bam_unmapped} \ + --OUTPUT ${sample_id}${extension}.bam \ + --REFERENCE_SEQUENCE ${params.ref} \ + --SORT_ORDER coordinate \ + --ADD_MATE_CIGAR true \ + --MAX_INSERTIONS_OR_DELETIONS -1 \ + --PRIMARY_ALIGNMENT_STRATEGY MostDistant \ + --ALIGNER_PROPER_PAIR_FLAGS true \ + --CREATE_INDEX true \ + --CLIP_OVERLAPPING_READS false + """ +} \ No newline at end of file diff --git a/modules/multiqc.nf b/modules/multiqc.nf new file mode 100644 index 0000000..2c10719 --- /dev/null +++ b/modules/multiqc.nf @@ -0,0 +1,34 @@ +// Generate a multi-quality control report from collected metrics data (process +// collectmetrics2 output). 
+process MultiQC { + tag "${sample_id}" + label 'multiqc' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), path(file) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}") + + """ + multiqc ${params.outdir}/${sample_id} -o ${sample_id}${extension} + """ +} + +process MultiQC_ALL { + publishDir "${params.outdir}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), path(file) + val extension + + output: + file("${extension}") + + """ + multiqc ${params.outdir} -o ${extension} + """ +} \ No newline at end of file diff --git a/modules/picard.nf b/modules/picard.nf new file mode 100644 index 0000000..798e6d1 --- /dev/null +++ b/modules/picard.nf @@ -0,0 +1,43 @@ +process BedToIntervalList { + label 'picard' + + input: + path dict + path bed + val extension + + output: + file "${extension}.interval_list" + + """ + picard BedToIntervalList \ + --SEQUENCE_DICTIONARY ${dict} \ + --INPUT ${bed} \ + --OUTPUT ${extension}.interval_list + """ +} + +// Extraction of read quality metrics before deduplication +process CollectHsMetrics { + tag "$sample_id" + label 'picard' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), file(bam) + file bed + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.txt") + + """ + picard CollectHsMetrics \ + --REFERENCE_SEQUENCE ${params.ref} \ + --BAIT_INTERVALS ${bed} \ + --TARGET_INTERVALS ${bed} \ + --INPUT ${bam} \ + --OUTPUT ${sample_id}${extension}.txt + """ +} diff --git a/modules/samtools.nf b/modules/samtools.nf new file mode 100644 index 0000000..435f7c9 --- /dev/null +++ b/modules/samtools.nf @@ -0,0 +1,19 @@ +// +process UmiMergeFilt { + tag "$sample_id" + label 'samtools' + + input: + tuple val(sample_id), path(bam) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam") + + """ + samtools view \ + -f2 \ + 
-bh ${bam} \ + > ${sample_id}${extension}.bam + """ +} \ No newline at end of file diff --git a/modules/vardict.nf b/modules/vardict.nf new file mode 100644 index 0000000..8087217 --- /dev/null +++ b/modules/vardict.nf @@ -0,0 +1,26 @@ +// Variant calling step using vardict +process VarDict { + tag "$sample_id" + label 'verdict' + + input: + tuple val(sample_id), path(bami) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.vcf") + + """ + vardict \ + -G ${params.ref} \ + -f 0.0005 \ + -N ${sample_id} \ + -b ${bami[1]} \ + -c 1 \ + -S 2 \ + -E 3 \ + -g 4 ${params.bed} \ + | $params.teststrandbias \ + | $params.var2vcf > ${sample_id}${extension}.vcf + """ +} diff --git a/modules/vep.nf b/modules/vep.nf new file mode 100644 index 0000000..ff331a7 --- /dev/null +++ b/modules/vep.nf @@ -0,0 +1,32 @@ +// Annotation step using VEP +process AnnotationVEP { + tag "$sample_id" + label 'vep' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), path(vcf) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.vcf") + + """ + vep \ + -i ${vcf} \ + -o ${sample_id}${extension}.vcf \ + --cache \ + --dir_cache ${params.cache} \ + --offline \ + --force_overwrite \ + --vcf \ + --numbers \ + --refseq \ + --symbol \ + --hgvs \ + --canonical \ + --max_af \ + --fasta ${params.fasta} + """ +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 9293def..f46ef73 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,21 +1,81 @@ -docker.enabled = true -conda.enabled = true +manifest { + name = 'MobiCT' + author = '' + homePage = 'https://github.com/SimCab-CHU/MobiCT' + description = 'ctDNA Analysis pipeline' + mainScript = 'MobiCT.nf' + nextflowVersion = '>=22.04.0' + version = '1.0.0' +} + +// meta parameters dag.overwrite = true -process.executor = 'slurm' -workDir = "/path/to/workDir" params { - struct_r1 = "5M2S+T" - struct_r2 = "5M2S+T" - 
ref = "/path/to/referenceGenome.fasta" - dict = "/path/to/referenceGenome.dict" - fastq = "/path/to/fastq/files/*_{R1,R2}_001.fastq.gz" - filter_fastq = /^((?!Undetermined).)*$/ - outdir = "/output/path" - bed = "/path/to/bedFile" + struct_r1 = "5M2S+T" + struct_r2 = "5M2S+T" + ref = "/path/to/referenceGenome.fasta" + dict = "/path/to/referenceGenome.dict" + fastq = "/path/to/fastq/files/*_{R1,R2}_001.fastq.gz" + filter_fastq = /^((?!Undetermined).)*$/ + outdir = "/path/to/outdir" + bed = "/path/to/bedFile" teststrandbias = "/path/to/teststrandbias.R/used/by/vardict" - var2vcf = "/path/to/var2vcf_valid.pl/used/by/vardict" - cache = "/path/to/cacheVEP" - fasta = "/path/to/fastaVEP" - tmp_dir = "/path/to/a/tmp/dir/" (optional) + var2vcf = "/path/to/var2vcf_valid.pl/used/by/vardict" + cache = "/path/to/cacheVEP" + fasta = "/path/to/fastaVEP" + pipeline_report="pipeline_report" +} + +profiles { + + slurm { + process.executor = 'slurm' + } + + docker { + docker.enabled = true + docker.runOptions='-u "$( id -u ):$( id -g )"' + includeConfig "$baseDir/config/softwares.config" + } + + singularity { + singularity.enabled = true + includeConfig "$baseDir/config/softwares.config" + } + conda { + conda.enabled = true + includeConfig "$baseDir/config/softwares.config" + } + test{ + params { + bed = "$baseDir/Example_data/panel_lung.bed" + fastq = "$baseDir/Example_data/*_{R1,R2}_001.fastq.gz" + outdir = "result_test" + ref = "/path/to/referenceGenome.fasta" + dict = "/path/to/referenceGenome.dict" + } + } +} + +resume = true + +timeline { + enabled = true + file = "${params.pipeline_report}/execution_timeline_${new Date().format('yyyyMMddHHmmss')}.html" +} + +report { + enabled = true + file = "${params.pipeline_report}/execution_report_${new Date().format('yyyyMMddHHmmss')}.html" +} + +trace { + enabled = true + file = "${params.pipeline_report}/execution_trace_${new Date().format('yyyyMMddHHmmss')}.txt" +} + +dag { + enabled = true + file = 
"${params.pipeline_report}/pipeline_dag.svg" } From 1bdc7420dd71fcd3e6d1c8a30dc1817ce472b0fa Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Wed, 2 Apr 2025 21:12:08 +0200 Subject: [PATCH 02/12] fix hpc config file --- config/ressources/hpc.config | 76 ++++++++---------------------------- 1 file changed, 16 insertions(+), 60 deletions(-) diff --git a/config/ressources/hpc.config b/config/ressources/hpc.config index 9156f0c..785ecc4 100644 --- a/config/ressources/hpc.config +++ b/config/ressources/hpc.config @@ -6,87 +6,43 @@ process { stageOutMode = 'rsync' withLabel: 'bcftools' { - cpus = 1 + cpus = 4 time = '1h' } withLabel: 'bwamem' { - cpus = 10 - time = '24h' - } - withLabel: 'bowtie' { - cpus = 16 - time = '4h' - } - withLabel: 'bowtie2' { - cpus = 16 - time = '4h' - } - withLabel: 'bwa' { - cpus = 16 - time = '4h' - } - withName: 'fastp' { - cpus = 16 - time = '2h' - } - withLabel: 'fastqc' { - cpus = 8 - time = '1h' - } - withLabel: 'hisat2' { cpus = 16 time = '4h' } - withLabel: 'kallisto' { - cpus = 16 + withLabel: 'fastp' { + cpus = 4 time = '4h' } - withLabel: 'graphmap2' { - cpus = 16 + withLabel: 'fgbio' { + cpus = 4 time = '4h' } - withLabel: 'minimap2' { - cpus = 16 + withLabel: 'gatk4' { + cpus = 4 time = '4h' } - withLabel: 'multiqc' { - cpus = 8 - time = '1h' - } - withLabel: 'mummer4' { - cpus = 16 - time = '4h' + withName: 'multiqc' { + cpus = 4 + time = '2h' } - withLabel: 'ngmlr' { - cpus = 16 + withLabel: 'picard' { + cpus = 4 time = '1h' } - withLabel: 'novoalign' { - cpus = 16 - time = '4h' - } - withLabel: 'salmon' { - cpus = 16 - time = '4h' - } withLabel: 'samtools' { - cpus = 16 - time = '2h' - } - withLabel: 'seqkit' { cpus = 8 time = '4h' } - withLabel: 'seqtk' { - cpus = 16 + withLabel: 'vardict' { + cpus = 2 time = '4h' } - withLabel: 'star' { - cpus = 16 - time = '4h' - } - withLabel: 'subread' { - cpus = 16 + withLabel: 'vep' { + cpus = 4 time = '4h' } } From 91d3a79977ec9d9d5d1d8f4e25f3d51e6656eb06 Mon Sep 17 00:00:00 2001 
From: Jacques Dainat Date: Thu, 3 Apr 2025 18:03:32 +0200 Subject: [PATCH 03/12] add multithreading --- modules/bwamem.nf | 1 + modules/fastp.nf | 1 + modules/vep.nf | 1 + 3 files changed, 3 insertions(+) diff --git a/modules/bwamem.nf b/modules/bwamem.nf index 6a46753..17fded8 100644 --- a/modules/bwamem.nf +++ b/modules/bwamem.nf @@ -14,6 +14,7 @@ process BWAmem { """ bwa mem \ + -t ${task.cpus} \ ${opt_bwa} \ -M ${params.ref} \ ${fastq[0]} \ diff --git a/modules/fastp.nf b/modules/fastp.nf index 2856a8e..c0ade77 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -17,6 +17,7 @@ process Fastp { script: """ fastp \ + --thread ${task.cpus} \ -i ${fastq[0]} \ -o ${sample_id}${extension}.R1.fq \ -I ${fastq[1]} \ diff --git a/modules/vep.nf b/modules/vep.nf index ff331a7..d0d8ace 100644 --- a/modules/vep.nf +++ b/modules/vep.nf @@ -14,6 +14,7 @@ process AnnotationVEP { """ vep \ + --fork ${task.cpus} \ -i ${vcf} \ -o ${sample_id}${extension}.vcf \ --cache \ From e6ae5123b8040b0d352bbe33106a7a280886c38f Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Thu, 3 Apr 2025 18:13:30 +0200 Subject: [PATCH 04/12] I forgot to commit that one ^^ --- modules/bcftools.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/bcftools.nf b/modules/bcftools.nf index 961c5c2..297d942 100644 --- a/modules/bcftools.nf +++ b/modules/bcftools.nf @@ -12,6 +12,6 @@ process BCFtools_stats { tuple val(sample_id), file("${sample_id}${extension}.stats") """ - bcftools stats ${file} > ${sample_id}${extension}.stats + bcftools stats --threads ${task.cpus} ${file} > ${sample_id}${extension}.stats """ } From aa6a728fe075ff253268aecac3361d19ee1591c3 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Thu, 3 Apr 2025 20:57:59 +0200 Subject: [PATCH 05/12] make genome ref as a channel to check early its existence to avoid proces crash and inform user at the very beginning --- MobiCT.nf | 17 ++++++++++------- modules/bwamem.nf | 3 ++- modules/gatk.nf | 6 ++++-- 
modules/picard.nf | 3 ++- modules/vardict.nf | 3 ++- 5 files changed, 20 insertions(+), 12 deletions(-) diff --git a/MobiCT.nf b/MobiCT.nf index 259a139..33d0e0c 100644 --- a/MobiCT.nf +++ b/MobiCT.nf @@ -41,16 +41,19 @@ workflow { Channel.fromFilePairs(params.fastq, checkIfExists:true) .filter{ v -> v=~ params.filter_fastq} .set{read_pairs_fastq} + Channel.fromPath(params.ref, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find genome matching ${params.ref}!\n" } + .set{genomeref} // 1. Preprocess deduplication ConvertFastqToSam(read_pairs_fastq, ".1.unmapped") ExtractUmis(ConvertFastqToSam.out, params.struct_r1, params.struct_r2, ".1.umi_extracted") ConvertSamToFastq(ExtractUmis.out, ".1.umi_extracted") Fastp(ConvertSamToFastq.out, ".1.umi_extracted.trimmed") - BWAmem(Fastp.out[0], "-t 10", ".1.umi_extracted.aligned") + BWAmem(Fastp.out[0], genomeref, "-t 10", ".1.umi_extracted.aligned") BWAmem.out.join(ExtractUmis.out).set{bams_umis} - MergeBam(bams_umis, ".1.merged") + MergeBam(bams_umis, genomeref, ".1.merged") UmiMergeFilt(MergeBam.out, ".1.filtered") // 2. Process deduplication @@ -59,21 +62,21 @@ workflow { // 3. Post process deduplication RerunConvertSamToFastq(CallConsensus.out, ".3.unmapped") - RerunBWAmem(RerunConvertSamToFastq.out, "-t 10 -Y", ".3.consensus_mapped") + RerunBWAmem(RerunConvertSamToFastq.out, genomeref, "-t 10 -Y", ".3.consensus_mapped") SortConsensus(CallConsensus.out, ".3.unmapped") RerunSortConsensus(RerunBWAmem.out, ".3.mapped") RerunSortConsensus.out.join(SortConsensus.out).set{bams_consensus} - MergeBam2(bams_consensus, ".3.merged") + MergeBam2(bams_consensus, genomeref, ".3.merged") // 4. 
Variant Calling & Annotation - VarDict(MergeBam2.out, ".4.vardict") + VarDict(MergeBam2.out, genomeref, ".4.vardict") AnnotationVEP(VarDict.out, ".4.vardict.vep") // Quality Controls BedToIntervalList(params.dict, params.bed, params.bed.tokenize('.')[0].tokenize('/')[-1]) - CollectHsMetrics(BWAmem.out, BedToIntervalList.out, ".QC.HsMetrics.1") - RerunCollectHsMetrics(RerunBWAmem.out, BedToIntervalList.out, ".QC.HsMetrics.3") + CollectHsMetrics(BWAmem.out, BedToIntervalList.out, genomeref, ".QC.HsMetrics.1") + RerunCollectHsMetrics(RerunBWAmem.out, BedToIntervalList.out, genomeref, ".QC.HsMetrics.3") BCFtools_stats(VarDict.out, ".QC.bcftools_stats") MultiQC(BCFtools_stats.out, ".QC.multiQC") diff --git a/modules/bwamem.nf b/modules/bwamem.nf index 17fded8..5a9f779 100644 --- a/modules/bwamem.nf +++ b/modules/bwamem.nf @@ -6,6 +6,7 @@ process BWAmem { input: tuple val(sample_id), path(fastq) + path genomeref val opt_bwa val extension @@ -16,7 +17,7 @@ process BWAmem { bwa mem \ -t ${task.cpus} \ ${opt_bwa} \ - -M ${params.ref} \ + -M ${genomeref} \ ${fastq[0]} \ ${fastq[1]} \ | \ diff --git a/modules/gatk.nf b/modules/gatk.nf index 8050325..fc24f32 100644 --- a/modules/gatk.nf +++ b/modules/gatk.nf @@ -52,6 +52,7 @@ process MergeBam { input: tuple val(sample_id), path(bam_aligned), path(bam_unmapped) + path genomeref val extension @@ -66,7 +67,7 @@ process MergeBam { --ALIGNED_BAM ${bam_aligned} \ --UNMAPPED_BAM ${bam_unmapped} \ --OUTPUT ${sample_id}${extension}.bam \ - --REFERENCE_SEQUENCE ${params.ref} \ + --REFERENCE_SEQUENCE ${genomeref} \ --SORT_ORDER 'queryname' \ --ALIGNED_READS_ONLY true \ --MAX_INSERTIONS_OR_DELETIONS -1 \ @@ -109,6 +110,7 @@ process MergeBam2 { input: tuple val(sample_id), path(bam_aligned), path(bam_unmapped) + path genomeref val extension output: @@ -121,7 +123,7 @@ process MergeBam2 { --ALIGNED_BAM ${bam_aligned} \ --UNMAPPED_BAM ${bam_unmapped} \ --OUTPUT ${sample_id}${extension}.bam \ - --REFERENCE_SEQUENCE ${params.ref} \ + 
--REFERENCE_SEQUENCE ${genomeref} \ --SORT_ORDER coordinate \ --ADD_MATE_CIGAR true \ --MAX_INSERTIONS_OR_DELETIONS -1 \ diff --git a/modules/picard.nf b/modules/picard.nf index 798e6d1..d37316f 100644 --- a/modules/picard.nf +++ b/modules/picard.nf @@ -27,6 +27,7 @@ process CollectHsMetrics { input: tuple val(sample_id), file(bam) file bed + path genomeref val extension output: @@ -34,7 +35,7 @@ process CollectHsMetrics { """ picard CollectHsMetrics \ - --REFERENCE_SEQUENCE ${params.ref} \ + --REFERENCE_SEQUENCE ${genomeref} \ --BAIT_INTERVALS ${bed} \ --TARGET_INTERVALS ${bed} \ --INPUT ${bam} \ diff --git a/modules/vardict.nf b/modules/vardict.nf index 8087217..c0a4335 100644 --- a/modules/vardict.nf +++ b/modules/vardict.nf @@ -5,6 +5,7 @@ process VarDict { input: tuple val(sample_id), path(bami) + path genomeref val extension output: @@ -12,7 +13,7 @@ process VarDict { """ vardict \ - -G ${params.ref} \ + -G ${genomeref} \ -f 0.0005 \ -N ${sample_id} \ -b ${bami[1]} \ From 5fecc31ee8ded28a5311e6f9e4031b2dc8156d92 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Thu, 3 Apr 2025 20:58:34 +0200 Subject: [PATCH 06/12] add test for profile existence at the very beginning --- MobiCT.nf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/MobiCT.nf b/MobiCT.nf index 33d0e0c..173be62 100644 --- a/MobiCT.nf +++ b/MobiCT.nf @@ -23,6 +23,14 @@ email = 's-cabelloaguilar@chu-montpellier.fr' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// Check profiles +if ( + workflow.profile.contains('singularity') || + workflow.profile.contains('docker') || + workflow.profile.contains('conda') + ) { "executer selected" } +else { exit 1, "No executer selected, please use: -profile docker/singularity/conda (choose one of the 3 possibilities)"} + // Include modules include {ConvertFastqToSam; MergeBam; MergeBam2; SortConsensus; SortConsensus as RerunSortConsensus; ConvertSamToFastq; ConvertSamToFastq as RerunConvertSamToFastq } from 
"$baseDir/modules/gatk.nf" include {BedToIntervalList; CollectHsMetrics; CollectHsMetrics as RerunCollectHsMetrics;} from "$baseDir/modules/picard.nf" From 341f9feb4a7095b60e8858e3d59e91a2007e8c9d Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 8 Apr 2025 13:55:00 +0200 Subject: [PATCH 07/12] add pigz --- config/ressources/conda/pigz.yaml | 7 +++++++ modules/pigz.nf | 29 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 config/ressources/conda/pigz.yaml create mode 100644 modules/pigz.nf diff --git a/config/ressources/conda/pigz.yaml b/config/ressources/conda/pigz.yaml new file mode 100644 index 0000000..e18ebd6 --- /dev/null +++ b/config/ressources/conda/pigz.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - pigz=2.8 \ No newline at end of file diff --git a/modules/pigz.nf b/modules/pigz.nf new file mode 100644 index 0000000..25bb466 --- /dev/null +++ b/modules/pigz.nf @@ -0,0 +1,29 @@ +// +process DealWithRef { + tag "$genome_fasta" + label 'pigz' + + input: + path(genome_fasta) + + output: + path "*.fa", emit: genomerefok + + script: + genomeReady = genome_fasta.baseName.replaceAll(/\.(gz)$/, '') // remove .gz + genomeReady = genomeReady.replaceAll(/\.(fasta|fa)$/, '') // remove .fasta or .fa + genomeFa = genomeReady + ".fa" + """ + + + # DEALING WITH GZIPPED FILES to uncompress if needed + extension=\$(echo ${genome_fasta} | awk -F . 
'{print \$NF}') + + if [[ \${extension} == "gz" ]];then + pigz -dck -p ${task.cpus} ${genome_fasta} > ${genomeFa} + else + # copy fa + ln -s ${genome_fasta} ${genomeFa} + fi + """ +} \ No newline at end of file From cb5874f10b0a53079dc9b2bc0b58d106889f46ee Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 8 Apr 2025 13:58:57 +0200 Subject: [PATCH 08/12] add CI with config for 2 cpus --- .github/workflows/ci.yml | 62 +++++++++++++++++++++++++++++++++++++ config/ressources/ci.config | 56 +++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 .github/workflows/ci.yml create mode 100644 config/ressources/ci.config diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..fff23c8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,62 @@ +# This is a basic workflow to help you get started with Actions + +name: CI + +# Controls when the action will run. +# Triggers the workflow on push or pull request events +on: [push, pull_request, workflow_dispatch] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + + # This workflow contains a second job called "build2" + test_MobiCT: + # avoid to run twice push and PR + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name + + # The type of runner that the job will run on + runs-on: ubuntu-24.04 + services: + docker: + image: docker:dind + options: --privileged --shm-size=2g + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Install/cache OpenJDK + - name: Cache OpenJDK + id: cache-openjdk + uses: actions/cache@v3 + with: + path: /usr/lib/jvm/java-11-openjdk-amd64 + key: ${{ runner.os 
}}-openjdk-11 + restore-keys: | + ${{ runner.os }}-openjdk-11 + - name: Install openjdk + if: ${{ steps.cache-openjdk.outputs.cache-hit != 'true' }} + run: sudo apt-get install openjdk-11-jdk + # Install/cache nextflow + - name: Cache nextflow + id: cache-nextflow + uses: actions/cache@v3 + with: + path: /usr/local/bin/nextflow + key: ${{ runner.os }}-nextflow + restore-keys: | + ${{ runner.os }}-nextflow + - name: Install Nextflox + if: ${{ steps.cache-nextflow.outputs.cache-hit != 'true' }} + run: curl -s https://get.nextflow.io | bash && mv nextflow /usr/local/bin && chmod +x /usr/local/bin/nextflow + + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v3 + # Run tests sequentialy + - name: test + run: nextflow run -ansi-log -profile docker,test,ci MobiCT.nf + diff --git a/config/ressources/ci.config b/config/ressources/ci.config new file mode 100644 index 0000000..8c79032 --- /dev/null +++ b/config/ressources/ci.config @@ -0,0 +1,56 @@ +process { + cpus = 1 + time = '1h' + maxForks = 8 + shell = ['/bin/bash', '-euo', 'pipefail'] + stageOutMode = 'rsync' + + withLabel: 'bcftools' { + cpus = 2 + time = '1h' + } + withLabel: 'BwaIndex' { + cpus = 1 + time = '4h' + } + withLabel: 'BWAmem' { + cpus = 2 + time = '4h' + } + withLabel: 'fastp' { + cpus = 2 + time = '4h' + } + withLabel: 'fgbio' { + cpus = 2 + time = '4h' + } + withLabel: 'gatk4' { + cpus = 2 + time = '4h' + } + withLabel: 'pigz' { + cpus = 2 + time = '4h' + } + withLabel: 'multiqc' { + cpus = 2 + time = '2h' + } + withLabel: 'picard' { + cpus = 2 + time = '1h' + } + withLabel: 'samtools' { + cpus = 2 + time = '4h' + } + withLabel: 'vardict' { + cpus = 1 + time = '4h' + } + withLabel: 'vep' { + cpus = 2 + time = '4h' + } +} From dfa02ce03b9cf3646c0f582d75e6c9fdb4a9c5c9 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 8 Apr 2025 14:01:13 +0200 Subject: [PATCH 09/12] simplify options, deal with gz ref, use channesl more than params --- 
MobiCT.nf | 58 +++++++++++++++++----------- config/ressources/conda/vardict.yaml | 2 +- config/ressources/hpc.config | 12 +++++- config/ressources/local.config | 14 +++++-- config/softwares.config | 9 ++++- modules/bwamem.nf | 26 +++++++++++-- modules/fgbio.nf | 29 ++++++++------ modules/gatk.nf | 3 +- modules/multiqc.nf | 17 +------- modules/picard.nf | 15 +++++++ modules/samtools.nf | 34 ++++++++++++++++ modules/vardict.nf | 12 +++--- modules/vep.nf | 10 ++--- nextflow.config | 27 +++++++------ 14 files changed, 182 insertions(+), 86 deletions(-) diff --git a/MobiCT.nf b/MobiCT.nf index 173be62..0106c7b 100644 --- a/MobiCT.nf +++ b/MobiCT.nf @@ -33,13 +33,14 @@ else { exit 1, "No executer selected, please use: -profile docker/singularity/co // Include modules include {ConvertFastqToSam; MergeBam; MergeBam2; SortConsensus; SortConsensus as RerunSortConsensus; ConvertSamToFastq; ConvertSamToFastq as RerunConvertSamToFastq } from "$baseDir/modules/gatk.nf" -include {BedToIntervalList; CollectHsMetrics; CollectHsMetrics as RerunCollectHsMetrics;} from "$baseDir/modules/picard.nf" -include {BWAmem; BWAmem as RerunBWAmem;} from "$baseDir/modules/bwamem.nf" +include {CreateSequenceDictionary; BedToIntervalList; CollectHsMetrics; CollectHsMetrics as RerunCollectHsMetrics;} from "$baseDir/modules/picard.nf" +include {BwaIndex; BWAmem; BWAmem as RerunBWAmem;} from "$baseDir/modules/bwamem.nf" include {BCFtools_stats} from "$baseDir/modules/bcftools.nf" include {Fastp} from "$baseDir/modules/fastp.nf" include {ExtractUmis; GroupReads; CallConsensus; } from "$baseDir/modules/fgbio.nf" -include {MultiQC; MultiQC_ALL; } from "$baseDir/modules/multiqc.nf" -include {UmiMergeFilt} from "$baseDir/modules/samtools.nf" +include {MultiQC; MultiQC as MultiQC_ALL; } from "$baseDir/modules/multiqc.nf" +include {Faidx; UmiMergeFilt; Sam2bam; Sam2bam as ReSam2bam} from "$baseDir/modules/samtools.nf" +include {DealWithRef} from "$baseDir/modules/pigz.nf" include {VarDict} from 
"$baseDir/modules/vardict.nf" include {AnnotationVEP} from "$baseDir/modules/vep.nf" @@ -53,45 +54,56 @@ workflow { .ifEmpty { exit 1, "Cannot find genome matching ${params.ref}!\n" } .set{genomeref} + + // htslib-based tools (e.g., samtools, bcftools) Require BGZF-compressed FASTA and Use .fai index (samtools faidx) + DealWithRef(genomeref.collect()) + DealWithRef.out.genomerefok.set{genomerefok} + + Faidx(genomerefok).set{genome_fai} + + // Create the genome index + BwaIndex(genomerefok) + + // Picard/GATK can use gzipped compressed fasta but need a .dict file + // Create the genome dictionary + CreateSequenceDictionary(genomerefok) + .set{genome_dict} + // 1. Preprocess deduplication ConvertFastqToSam(read_pairs_fastq, ".1.unmapped") ExtractUmis(ConvertFastqToSam.out, params.struct_r1, params.struct_r2, ".1.umi_extracted") ConvertSamToFastq(ExtractUmis.out, ".1.umi_extracted") Fastp(ConvertSamToFastq.out, ".1.umi_extracted.trimmed") - BWAmem(Fastp.out[0], genomeref, "-t 10", ".1.umi_extracted.aligned") + BWAmem(Fastp.out[0], genomerefok, BwaIndex.out, "", ".1.umi_extracted.aligned") + Sam2bam(BWAmem.out.tuple_sample_sam) - BWAmem.out.join(ExtractUmis.out).set{bams_umis} - MergeBam(bams_umis, genomeref, ".1.merged") + Sam2bam.out.join(ExtractUmis.out).set{bams_umis} + MergeBam(bams_umis, genomerefok, genome_dict, ".1.merged") UmiMergeFilt(MergeBam.out, ".1.filtered") // 2. Process deduplication GroupReads(UmiMergeFilt.out, ".2.umi_grouped") - CallConsensus(GroupReads.out.nextout, ".2.consensus_unmapped") + CallConsensus(GroupReads.out.nextout, genome_fai, ".2.consensus_unmapped") // 3. 
Post process deduplication RerunConvertSamToFastq(CallConsensus.out, ".3.unmapped") - RerunBWAmem(RerunConvertSamToFastq.out, genomeref, "-t 10 -Y", ".3.consensus_mapped") + RerunBWAmem(RerunConvertSamToFastq.out, genomerefok, BwaIndex.out, " -Y", ".3.consensus_mapped") SortConsensus(CallConsensus.out, ".3.unmapped") - RerunSortConsensus(RerunBWAmem.out, ".3.mapped") + ReSam2bam(RerunBWAmem.out.tuple_sample_sam) + RerunSortConsensus(ReSam2bam.out, ".3.mapped") RerunSortConsensus.out.join(SortConsensus.out).set{bams_consensus} - MergeBam2(bams_consensus, genomeref, ".3.merged") + MergeBam2(bams_consensus, genomerefok, genome_dict, ".3.merged") // 4. Variant Calling & Annotation - VarDict(MergeBam2.out, genomeref, ".4.vardict") - AnnotationVEP(VarDict.out, ".4.vardict.vep") + VarDict(MergeBam2.out, genomerefok, genome_fai, params.bed,".4.vardict") + AnnotationVEP(VarDict.out, genomerefok, ".4.vardict.vep") // Quality Controls - BedToIntervalList(params.dict, params.bed, params.bed.tokenize('.')[0].tokenize('/')[-1]) - CollectHsMetrics(BWAmem.out, BedToIntervalList.out, genomeref, ".QC.HsMetrics.1") - RerunCollectHsMetrics(RerunBWAmem.out, BedToIntervalList.out, genomeref, ".QC.HsMetrics.3") + BedToIntervalList(genome_dict, params.bed, params.bed.tokenize('.')[0].tokenize('/')[-1]) + CollectHsMetrics(Sam2bam.out, BedToIntervalList.out, genomerefok, genome_fai, ".QC.HsMetrics.1") + RerunCollectHsMetrics(ReSam2bam.out, BedToIntervalList.out, genomerefok, genome_fai, ".QC.HsMetrics.3") BCFtools_stats(VarDict.out, ".QC.bcftools_stats") MultiQC(BCFtools_stats.out, ".QC.multiQC") - - Channel.empty() - .mix( MultiQC.out ) - .map { sample, files -> files } - .collect() - .set { log_files } - MultiQC_ALL(log_files, "all.QC.multiQC") + MultiQC_ALL(BCFtools_stats.out.collect(), "all.QC.multiQC") } diff --git a/config/ressources/conda/vardict.yaml b/config/ressources/conda/vardict.yaml index 9af8e25..b6ab698 100644 --- a/config/ressources/conda/vardict.yaml +++ 
b/config/ressources/conda/vardict.yaml @@ -4,4 +4,4 @@ channels: - bioconda dependencies: - - vardict=2018.10.18 \ No newline at end of file + - vardict-java=1.8.3 \ No newline at end of file diff --git a/config/ressources/hpc.config b/config/ressources/hpc.config index 785ecc4..1be18d1 100644 --- a/config/ressources/hpc.config +++ b/config/ressources/hpc.config @@ -9,7 +9,11 @@ process { cpus = 4 time = '1h' } - withLabel: 'bwamem' { + withName: 'BwaIndex' { + cpus = 1 + time = '4h' + } + withName: 'BWAmem' { cpus = 16 time = '4h' } @@ -25,6 +29,10 @@ process { cpus = 4 time = '4h' } + withLabel: 'pigz' { + cpus = 8 + time = '4h' + } withName: 'multiqc' { cpus = 4 time = '2h' @@ -38,7 +46,7 @@ process { time = '4h' } withLabel: 'vardict' { - cpus = 2 + cpus = 1 time = '4h' } withLabel: 'vep' { diff --git a/config/ressources/local.config b/config/ressources/local.config index 9774c2c..5deefbc 100644 --- a/config/ressources/local.config +++ b/config/ressources/local.config @@ -9,7 +9,11 @@ process { cpus = 4 time = '1h' } - withLabel: 'bwamem' { + withName: 'BwaIndex' { + cpus = 1 + time = '4h' + } + withName: 'BWAmem' { cpus = 4 time = '4h' } @@ -25,7 +29,11 @@ process { cpus = 4 time = '4h' } - withName: 'multiqc' { + withLabel: 'pigz' { + cpus = 4 + time = '4h' + } + withLabel: 'multiqc' { cpus = 4 time = '2h' } @@ -38,7 +46,7 @@ process { time = '4h' } withLabel: 'vardict' { - cpus = 2 + cpus = 1 time = '4h' } withLabel: 'vep' { diff --git a/config/softwares.config b/config/softwares.config index a3564c8..05bbf04 100644 --- a/config/softwares.config +++ b/config/softwares.config @@ -34,6 +34,11 @@ process { container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/picard:3.3.0--hdfd78af_0' : docker.enabled ? 'quay.io/biocontainers/picard:3.3.0--hdfd78af_0' : '' } } + withLabel: 'pigz' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/pigz.yaml" } + container = { singularity.enabled ? 
'https://depot.galaxyproject.org/singularity/pigz:2.8' : + docker.enabled ? 'quay.io/biocontainers/pigz:2.8' : '' } + } withLabel: 'samtools' { conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/samtools.yaml" } container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/samtools:1.6--h5fe306e_11' : @@ -41,8 +46,8 @@ process { } withLabel: 'vardict' { conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/vardict.yaml" } - container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/vardict:2018.10.18--pl526_1' : - docker.enabled ? 'quay.io/biocontainers/vardict:2018.10.18--pl526_1' : '' } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/vardict-java:1.8.3--hdfd78af_0' : + docker.enabled ? 'quay.io/biocontainers/vardict-java:1.8.3--hdfd78af_0' : '' } } withLabel: 'vep' { conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/ensembl-vep.yaml" } diff --git a/modules/bwamem.nf b/modules/bwamem.nf index 5a9f779..ccf6d8b 100644 --- a/modules/bwamem.nf +++ b/modules/bwamem.nf @@ -1,3 +1,19 @@ +process BwaIndex { + label 'bwamem' + tag "$genome_fasta" + + input: + path(genome_fasta) + + output: + path("*") + + script: + """ + bwa index $genome_fasta -p ${genome_fasta.baseName} + """ +} + // Alignement before deduplication: Align quality and adapter trimmed reads to // the reference genome process BWAmem { @@ -7,20 +23,22 @@ process BWAmem { input: tuple val(sample_id), path(fastq) path genomeref + path bwa_index_files val opt_bwa val extension output: - tuple val(sample_id), file("${sample_id}${extension}.bam") + tuple val(sample_id), file("${sample_id}${extension}.sam"), emit: tuple_sample_sam + path "*.log", emit: bwamem_summary """ bwa mem \ -t ${task.cpus} \ ${opt_bwa} \ - -M ${genomeref} \ + -M \ + ${genomeref.baseName} \ ${fastq[0]} \ ${fastq[1]} \ - | \ - samtools view -bh -o 
${sample_id}${extension}.bam + > ${sample_id}${extension}.sam 2> ${sample_id}${extension}.log """ } \ No newline at end of file diff --git a/modules/fgbio.nf b/modules/fgbio.nf index cd9b802..227a1b7 100644 --- a/modules/fgbio.nf +++ b/modules/fgbio.nf @@ -12,6 +12,7 @@ process ExtractUmis { output: tuple val(sample_id), file("${sample_id}${extension}.bam") + script: """ fgbio ExtractUmisFromBam \ -i ${bam_file} \ @@ -26,7 +27,7 @@ process ExtractUmis { // The user can control how many errors/mismatches are allowed in the UMI sequence when assigning source molecules (--edits=n). process GroupReads { tag "$sample_id" - + label 'fgbio' publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.txt' input: @@ -37,6 +38,7 @@ process GroupReads { tuple val(sample_id), file("${sample_id}${extension}.bam"), emit: nextout file "${sample_id}.QC.family_size_counts.txt" + script: """ fgbio GroupReadsByUmi \ -i ${bam} \ @@ -54,23 +56,26 @@ process GroupReads { // to 1, in so doing the single read will be considered the consensus. 
process CallConsensus { tag "$sample_id" + label 'fgbio' input: tuple val(sample_id), path(bam) + path genome_fai val extension output: tuple val(sample_id), file("${sample_id}${extension}.bam") - """ - fgbio CallMolecularConsensusReads \ - -i ${bam} \ - -o ${sample_id}${extension}.bam \ - --error-rate-post-umi 40 \ - --error-rate-pre-umi 45 \ - --output-per-base-tags false \ - --min-reads 2 \ - --max-reads 50 \ - --read-name-prefix='consensus' - """ + script: + """ + fgbio CallMolecularConsensusReads \ + -i ${bam} \ + -o ${sample_id}${extension}.bam \ + --error-rate-post-umi 40 \ + --error-rate-pre-umi 45 \ + --output-per-base-tags false \ + --min-reads 2 \ + --max-reads 50 \ + --read-name-prefix='consensus' + """ } \ No newline at end of file diff --git a/modules/gatk.nf b/modules/gatk.nf index fc24f32..10f00cd 100644 --- a/modules/gatk.nf +++ b/modules/gatk.nf @@ -16,7 +16,6 @@ process ConvertFastqToSam { --FASTQ2 ${fastq[1]} \ --OUTPUT ${sample_id}${extension}.bam \ --SAMPLE_NAME ${sample_id} \ - --TMP_DIR ${params.tmp_dir} """ } @@ -53,6 +52,7 @@ process MergeBam { input: tuple val(sample_id), path(bam_aligned), path(bam_unmapped) path genomeref + path genome_dict val extension @@ -111,6 +111,7 @@ process MergeBam2 { input: tuple val(sample_id), path(bam_aligned), path(bam_unmapped) path genomeref + path genome_dict val extension output: diff --git a/modules/multiqc.nf b/modules/multiqc.nf index 2c10719..d1627d1 100644 --- a/modules/multiqc.nf +++ b/modules/multiqc.nf @@ -14,21 +14,6 @@ process MultiQC { tuple val(sample_id), file("${sample_id}${extension}") """ - multiqc ${params.outdir}/${sample_id} -o ${sample_id}${extension} - """ -} - -process MultiQC_ALL { - publishDir "${params.outdir}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), path(file) - val extension - - output: - file("${extension}") - - """ - multiqc ${params.outdir} -o ${extension} + multiqc -p . 
-o ${sample_id}${extension} """ } \ No newline at end of file diff --git a/modules/picard.nf b/modules/picard.nf index d37316f..e638538 100644 --- a/modules/picard.nf +++ b/modules/picard.nf @@ -1,3 +1,17 @@ +process CreateSequenceDictionary { + label 'picard' + + input: + path genome + + output: + file "*.dict" + + """ + picard CreateSequenceDictionary R=${genome} O=${genome}.dict + """ +} + process BedToIntervalList { label 'picard' @@ -28,6 +42,7 @@ process CollectHsMetrics { tuple val(sample_id), file(bam) file bed path genomeref + path genome_fai val extension output: diff --git a/modules/samtools.nf b/modules/samtools.nf index 435f7c9..6954802 100644 --- a/modules/samtools.nf +++ b/modules/samtools.nf @@ -1,3 +1,19 @@ +// +process Faidx { + tag "$genome_fasta" + label 'samtools' + + input: + path(genome_fasta) + + output: + path("*") + + """ + samtools faidx $genome_fasta + """ +} + // process UmiMergeFilt { tag "$sample_id" @@ -16,4 +32,22 @@ process UmiMergeFilt { -bh ${bam} \ > ${sample_id}${extension}.bam """ +} + +process Sam2bam { + label 'samtools' + tag "$sample" + + input: + tuple val(sample), path(sam) + + output: + tuple val(sample), path ("*.bam"), emit: tuple_sample_bam + + script: + + """ + samtools view -@ ${task.cpus} ${sam} -bh -o ${sam.baseName}.bam + """ + } \ No newline at end of file diff --git a/modules/vardict.nf b/modules/vardict.nf index c0a4335..f6e6e46 100644 --- a/modules/vardict.nf +++ b/modules/vardict.nf @@ -1,18 +1,20 @@ // Variant calling step using vardict process VarDict { tag "$sample_id" - label 'verdict' + label 'vardict' input: tuple val(sample_id), path(bami) path genomeref + path genomeref_fai + path bed val extension output: tuple val(sample_id), file("${sample_id}${extension}.vcf") """ - vardict \ + vardict-java \ -G ${genomeref} \ -f 0.0005 \ -N ${sample_id} \ @@ -20,8 +22,8 @@ process VarDict { -c 1 \ -S 2 \ -E 3 \ - -g 4 ${params.bed} \ - | $params.teststrandbias \ - | $params.var2vcf > 
${sample_id}${extension}.vcf + -g 4 ${bed} \ + | teststrandbias.R \ + | var2vcf_valid.pl > ${sample_id}${extension}.vcf """ } diff --git a/modules/vep.nf b/modules/vep.nf index d0d8ace..c95075e 100644 --- a/modules/vep.nf +++ b/modules/vep.nf @@ -7,19 +7,20 @@ process AnnotationVEP { input: tuple val(sample_id), path(vcf) + path genomeref val extension output: tuple val(sample_id), file("${sample_id}${extension}.vcf") + script: + def config = params.cache?.trim() ? "--offline --cache --dir_cache ${params.cache} --max_af " : "--database" """ vep \ + ${config} \ --fork ${task.cpus} \ -i ${vcf} \ -o ${sample_id}${extension}.vcf \ - --cache \ - --dir_cache ${params.cache} \ - --offline \ --force_overwrite \ --vcf \ --numbers \ @@ -27,7 +28,6 @@ process AnnotationVEP { --symbol \ --hgvs \ --canonical \ - --max_af \ - --fasta ${params.fasta} + --fasta ${genomeref} """ } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index f46ef73..b5f4e35 100644 --- a/nextflow.config +++ b/nextflow.config @@ -12,23 +12,21 @@ manifest { dag.overwrite = true params { - struct_r1 = "5M2S+T" - struct_r2 = "5M2S+T" - ref = "/path/to/referenceGenome.fasta" - dict = "/path/to/referenceGenome.dict" + // main params + ref = "/path/to/referenceGenome.fasta" // If you’re using a VEP cache (e.g. Ensembl’s), use the matching reference FASTA from Ensembl fastq = "/path/to/fastq/files/*_{R1,R2}_001.fastq.gz" - filter_fastq = /^((?!Undetermined).)*$/ outdir = "/path/to/outdir" bed = "/path/to/bedFile" - teststrandbias = "/path/to/teststrandbias.R/used/by/vardict" - var2vcf = "/path/to/var2vcf_valid.pl/used/by/vardict" - cache = "/path/to/cacheVEP" - fasta = "/path/to/fastaVEP" + cache = "" // Path to the VEP cache. 
If you’re using a VEP cache, use the matching reference FASTA from Ensembl: + + // secondary params + struct_r1 = "5M2S+T" + struct_r2 = "5M2S+T" + filter_fastq = /^((?!Undetermined).)*$/ pipeline_report="pipeline_report" } profiles { - slurm { process.executor = 'slurm' } @@ -47,13 +45,18 @@ profiles { conda.enabled = true includeConfig "$baseDir/config/softwares.config" } + local { + includeConfig "$baseDir/config/ressources/local.config" + } + ci { + includeConfig "$baseDir/config/ressources/ci.config" + } test{ params { bed = "$baseDir/Example_data/panel_lung.bed" fastq = "$baseDir/Example_data/*_{R1,R2}_001.fastq.gz" outdir = "result_test" - ref = "/path/to/referenceGenome.fasta" - dict = "/path/to/referenceGenome.dict" + ref = "https://ftp.ensembl.org/pub/release-113/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.7.fa.gz" } } } From 2c48e98befbc12475f8dbec8b1daa92b7bff3843 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 8 Apr 2025 14:11:08 +0200 Subject: [PATCH 10/12] fix ID to be similar to ensembl --- Example_data/panel_lung.bed | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/Example_data/panel_lung.bed b/Example_data/panel_lung.bed index 621c4a5..fe0c88b 100644 --- a/Example_data/panel_lung.bed +++ b/Example_data/panel_lung.bed @@ -1,8 +1,5 @@ -chr12 25245268 25245388 NM_033360 1 -chr15 90088586 90088747 NM_002168 1 -chr2 208248368 208248660 NM_005896 1 -chr7 55173921 55174041 NM_005228 1 -chr7 55174711 55174831 NM_005228 1 -chr7 55181292 55181478 NM_005228 1 -chr7 55191718 55191874 NM_005228 1 -chr7 140753274 140753394 NM_004333 1 +7 55173921 55174041 NM_005228 1 +7 55174711 55174831 NM_005228 1 +7 55181292 55181478 NM_005228 1 +7 55191718 55191874 NM_005228 1 +7 140753274 140753394 NM_004333 1 From 9d9da31d6bb0b1e515f3e1dcda25cabe6ed0d3d4 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 8 Apr 2025 14:11:40 +0200 Subject: [PATCH 11/12] fix maxfork --- config/ressources/ci.config | 2 +- 
config/ressources/local.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/ressources/ci.config b/config/ressources/ci.config index 8c79032..968c27c 100644 --- a/config/ressources/ci.config +++ b/config/ressources/ci.config @@ -1,7 +1,7 @@ process { cpus = 1 time = '1h' - maxForks = 8 + maxForks = 2 shell = ['/bin/bash', '-euo', 'pipefail'] stageOutMode = 'rsync' diff --git a/config/ressources/local.config b/config/ressources/local.config index 5deefbc..327eeca 100644 --- a/config/ressources/local.config +++ b/config/ressources/local.config @@ -1,7 +1,7 @@ process { cpus = 1 time = '1h' - maxForks = 8 + maxForks = 4 shell = ['/bin/bash', '-euo', 'pipefail'] stageOutMode = 'rsync' From 18a5fe1146af26d6cc69859b00a309719f6ff3f3 Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Tue, 8 Apr 2025 14:12:07 +0200 Subject: [PATCH 12/12] add comment --- nextflow.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nextflow.config b/nextflow.config index b5f4e35..c539348 100644 --- a/nextflow.config +++ b/nextflow.config @@ -45,9 +45,11 @@ profiles { conda.enabled = true includeConfig "$baseDir/config/softwares.config" } + // Use maximum 4 cpu per process local { includeConfig "$baseDir/config/ressources/local.config" } + // Use maximum 2 cpu per process ci { includeConfig "$baseDir/config/ressources/ci.config" }