diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..fff23c8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,62 @@ +# This is a basic workflow to help you get started with Actions + +name: CI + +# Controls when the action will run. +# Triggers the workflow on push or pull request events +on: [push, pull_request, workflow_dispatch] + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + + # This workflow contains a second job called "build2" + test_MobiCT: + # avoid to run twice push and PR + if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name + + # The type of runner that the job will run on + runs-on: ubuntu-24.04 + services: + docker: + image: docker:dind + options: --privileged --shm-size=2g + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Install/cache OpenJDK + - name: Cache OpenJDK + id: cache-openjdk + uses: actions/cache@v3 + with: + path: /usr/lib/jvm/java-11-openjdk-amd64 + key: ${{ runner.os }}-openjdk-11 + restore-keys: | + ${{ runner.os }}-openjdk-11 + - name: Install openjdk + if: ${{ steps.cache-openjdk.outputs.cache-hit != 'true' }} + run: sudo apt-get install openjdk-11-jdk + # Install/cache nextflow + - name: Cache nextflow + id: cache-nextflow + uses: actions/cache@v3 + with: + path: /usr/local/bin/nextflow + key: ${{ runner.os }}-nextflow + restore-keys: | + ${{ runner.os }}-nextflow + - name: Install Nextflox + if: ${{ steps.cache-nextflow.outputs.cache-hit != 'true' }} + run: curl -s https://get.nextflow.io | bash && mv nextflow /usr/local/bin && chmod +x /usr/local/bin/nextflow + + # Checks-out your repository under 
$GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v3 + # Run tests sequentialy + - name: test + run: nextflow run -ansi-log -profile docker,test,ci MobiCT.nf + diff --git a/Example_data/panel_lung.bed b/Example_data/panel_lung.bed index 621c4a5..fe0c88b 100644 --- a/Example_data/panel_lung.bed +++ b/Example_data/panel_lung.bed @@ -1,8 +1,5 @@ -chr12 25245268 25245388 NM_033360 1 -chr15 90088586 90088747 NM_002168 1 -chr2 208248368 208248660 NM_005896 1 -chr7 55173921 55174041 NM_005228 1 -chr7 55174711 55174831 NM_005228 1 -chr7 55181292 55181478 NM_005228 1 -chr7 55191718 55191874 NM_005228 1 -chr7 140753274 140753394 NM_004333 1 +7 55173921 55174041 NM_005228 1 +7 55174711 55174831 NM_005228 1 +7 55181292 55181478 NM_005228 1 +7 55191718 55191874 NM_005228 1 +7 140753274 140753394 NM_004333 1 diff --git a/MobiCT.nf b/MobiCT.nf index 338c373..0106c7b 100644 --- a/MobiCT.nf +++ b/MobiCT.nf @@ -23,523 +23,87 @@ email = 's-cabelloaguilar@chu-montpellier.fr' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// PROCESSES - -// Convert the demultiplexed, raw sequencing FASTQ files to BAM -process ConvertFastqToSam { - tag "$sample_id" - - input: - tuple val('sample_id'), path(fastq) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam") - - """ - gatk FastqToSam \ - --FASTQ ${fastq[0]} \ - --FASTQ2 ${fastq[1]} \ - --OUTPUT ${sample_id}${extension}.bam \ - --SAMPLE_NAME ${sample_id} \ - --TMP_DIR ${params.tmp_dir} - """ -} - -// Extraction of UMIs from the insert reads -process ExtractUmis { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam_file) - val struct_r1 - val struct_r2 - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam") - - """ - fgbio ExtractUmisFromBam \ - -i ${bam_file} \ - -o ${sample_id}${extension}.bam \ - -r ${struct_r1} ${struct_r2} \ - -t RX \ - -a true - """ -} - -// Convert the BAM file with UMI extracted 
reads to a FASTQ file -process ConvertSamToFastq { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam_file) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq") - - """ - gatk SamToFastq \ - -I ${bam_file} \ - -F ${sample_id}${extension}.R1.fq \ - -F2 ${sample_id}${extension}.R2.fq \ - --CLIPPING_ATTRIBUTE XT \ - --CLIPPING_ACTION 2 \ - --MAX_RECORDS_IN_RAM 50000000 - """ -} - -// Adapter and quality trimming -process Fastp { - tag "$sample_id" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.{json,html}' - - input: - tuple val(sample_id), path(fastq) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq") - file "${sample_id}.QC.fastp.json" - file "${sample_id}.QC.fastp.html" - - script: - """ - fastp \ - -i ${fastq[0]} \ - -o ${sample_id}${extension}.R1.fq \ - -I ${fastq[1]} \ - -O ${sample_id}${extension}.R2.fq \ - -g -W 5 -q 20 -u 40 -x -3 -l 75 -c \ - -j ${sample_id}.QC.fastp.json \ - -h ${sample_id}.QC.fastp.html \ - -w 12 - """ -} - -// Alignement before deduplication: Align quality and adapter trimmed reads to -// the reference genome -process BWAmem { - tag "$sample_id" - clusterOptions '-n 10' - - input: - tuple val(sample_id), path(fastq) - val opt_bwa - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam") - - """ - bwa mem \ - ${opt_bwa} \ - -M ${params.ref} \ - ${fastq[0]} \ - ${fastq[1]} \ - | \ - samtools view -bh -o ${sample_id}${extension}.bam - """ -} - -// Merge the two BAM files containing: -// 1: the UMI information: output of ExtractUmis process -// 2: the alignment coordinate information: output of bwaMEM process -process MergeBam { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam_aligned), path(bam_unmapped) - val extension - - - output: - tuple val(sample_id), file("${sample_id}${extension}.ba[i,m]") - - """ - gatk MergeBamAlignment \ - --ATTRIBUTES_TO_RETAIN 
X0 \ - --ATTRIBUTES_TO_REMOVE NM \ - --ATTRIBUTES_TO_REMOVE MD \ - --ALIGNED_BAM ${bam_aligned} \ - --UNMAPPED_BAM ${bam_unmapped} \ - --OUTPUT ${sample_id}${extension}.bam \ - --REFERENCE_SEQUENCE ${params.ref} \ - --SORT_ORDER 'queryname' \ - --ALIGNED_READS_ONLY true \ - --MAX_INSERTIONS_OR_DELETIONS -1 \ - --PRIMARY_ALIGNMENT_STRATEGY MostDistant \ - --ALIGNER_PROPER_PAIR_FLAGS true \ - --CREATE_INDEX true \ - --CLIP_OVERLAPPING_READS false - """ -} - -// -process UmiMergeFilt { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam") - - """ - samtools view \ - -f2 \ - -bh ${bam} \ - > ${sample_id}${extension}.bam - """ -} - -// Identify and group reads originating from the same source molecule -// The user can control how many errors/mismatches are allowed in the UMI sequence when assigning source molecules (--edits=n). -process GroupReads { - tag "$sample_id" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.txt' - - input: - tuple val(sample_id), path(bam) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam"), emit: nextout - file "${sample_id}.QC.family_size_counts.txt" - - """ - fgbio GroupReadsByUmi \ - -i ${bam} \ - -o ${sample_id}${extension}.bam \ - --strategy=adjacency \ - --edits=1 \ - -t RX \ - -f ${sample_id}.QC.family_size_counts.txt - """ -} - -// Generate consensus reads -// Calculate the consensus sequence, Reads that occur as singletons are -// discarded by default but this can be changed by setting the –min-reads flag -// to 1, in so doing the single read will be considered the consensus. 
-process CallConsensus { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.bam") - - """ - fgbio CallMolecularConsensusReads \ - -i ${bam} \ - -o ${sample_id}${extension}.bam \ - --error-rate-post-umi 40 \ - --error-rate-pre-umi 45 \ - --output-per-base-tags false \ - --min-reads 2 \ - --max-reads 50 \ - --read-name-prefix='consensus' - """ -} - -workflow RerunConvertSamToFastq { - take: - tuple_input - extension - main: - ConvertSamToFastq(tuple_input, extension) - emit: - final_out = ConvertSamToFastq.out -} - -workflow RerunBWAmem { - take: - tuple_input - opt_bwa - extension - main: - BWAmem(tuple_input, opt_bwa, extension) - emit: - final_out = BWAmem.out -} - -// Sort the consensus_mapped.bam with the consensus_unmapped.bam to prepare them as input for the next step -process SortConsensus { - tag "$sample_id" - - input: - tuple val(sample_id), path(bam) - val extension - - output: - tuple val(sample_id), path("${sample_id}${extension}.sort.bam") - - script: - - """ - gatk SortSam \ - -I ${bam} \ - --OUTPUT ${sample_id}${extension}.sort.bam \ - --SORT_ORDER queryname - """ -} - -workflow RerunSortConsensus { - take: - tuple_input - extension - main: - SortConsensus(tuple_input, extension) - emit: - final_out = SortConsensus.out -} - -// Finally, merge the consensus_mapped.bam with the consensus_unmapped.bam to -// retain the UMI group information. 
-process MergeBam2 { - tag "$sample_id" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), path(bam_aligned), path(bam_unmapped) - val extension - - output: - tuple val(sample_id), path("${sample_id}${extension}.ba[m,i]") - - """ - gatk MergeBamAlignment \ - --ATTRIBUTES_TO_RETAIN X0 \ - --ATTRIBUTES_TO_RETAIN RX \ - --ALIGNED_BAM ${bam_aligned} \ - --UNMAPPED_BAM ${bam_unmapped} \ - --OUTPUT ${sample_id}${extension}.bam \ - --REFERENCE_SEQUENCE ${params.ref} \ - --SORT_ORDER coordinate \ - --ADD_MATE_CIGAR true \ - --MAX_INSERTIONS_OR_DELETIONS -1 \ - --PRIMARY_ALIGNMENT_STRATEGY MostDistant \ - --ALIGNER_PROPER_PAIR_FLAGS true \ - --CREATE_INDEX true \ - --CLIP_OVERLAPPING_READS false - """ -} - -// Variant calling step using vardict -process VarDict { - tag "$sample_id" - - input: - tuple val(sample_id), path(bami) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.vcf") - - """ - vardict \ - -G ${params.ref} \ - -f 0.0005 \ - -N ${sample_id} \ - -b ${bami[1]} \ - -c 1 \ - -S 2 \ - -E 3 \ - -g 4 ${params.bed} \ - | $params.teststrandbias \ - | $params.var2vcf > ${sample_id}${extension}.vcf - """ -} - -// Annotation step using VEP -process AnnotationVEP { - tag "$sample_id" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), path(vcf) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.vcf") - - """ - vep \ - -i ${vcf} \ - -o ${sample_id}${extension}.vcf \ - --cache \ - --dir_cache ${params.cache} \ - --offline \ - --force_overwrite \ - --vcf \ - --numbers \ - --refseq \ - --symbol \ - --hgvs \ - --canonical \ - --max_af \ - --fasta ${params.fasta} - """ -} - -process BedToIntervalList { - - input: - path dict - path bed - val extension - - output: - file "${extension}.interval_list" - - """ - picard BedToIntervalList \ - --SEQUENCE_DICTIONARY ${dict} \ - --INPUT ${bed} \ - 
--OUTPUT ${extension}.interval_list - """ -} - -// Extraction of read quality metrics before deduplication -process CollectHsMetrics { - tag "$sample_id" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), file(bam) - file bed - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.txt") - - """ - picard CollectHsMetrics \ - --REFERENCE_SEQUENCE ${params.ref} \ - --BAIT_INTERVALS ${bed} \ - --TARGET_INTERVALS ${bed} \ - --INPUT ${bam} \ - --OUTPUT ${sample_id}${extension}.txt - """ -} - -workflow RerunCollectHsMetrics { - take: - tuple_input - bed - extension - main: - CollectHsMetrics(tuple_input, bed, extension) - emit: - final_out = CollectHsMetrics.out -} - -process BCFtools_stats { - tag "${sample_id}" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), path(file) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}.stats") - - """ - bcftools stats ${file} > ${sample_id}${extension}.stats - """ -} - -// Generate a multi-quality control report from collected metrics data (process -// collectmetrics2 output). 
-process MultiQC { - tag "${sample_id}" - - publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true - - input: - tuple val(sample_id), path(file) - val extension - - output: - tuple val(sample_id), file("${sample_id}${extension}") - - """ - multiqc ${params.outdir}/${sample_id} -o ${sample_id}${extension} - """ -} +// Check profiles +if ( + workflow.profile.contains('singularity') || + workflow.profile.contains('docker') || + workflow.profile.contains('conda') + ) { "executer selected" } +else { exit 1, "No executer selected, please use: -profile docker/singularity/conda (choose one of the 3 possibilities)"} + +// Include modules +include {ConvertFastqToSam; MergeBam; MergeBam2; SortConsensus; SortConsensus as RerunSortConsensus; ConvertSamToFastq; ConvertSamToFastq as RerunConvertSamToFastq } from "$baseDir/modules/gatk.nf" +include {CreateSequenceDictionary; BedToIntervalList; CollectHsMetrics; CollectHsMetrics as RerunCollectHsMetrics;} from "$baseDir/modules/picard.nf" +include {BwaIndex; BWAmem; BWAmem as RerunBWAmem;} from "$baseDir/modules/bwamem.nf" +include {BCFtools_stats} from "$baseDir/modules/bcftools.nf" +include {Fastp} from "$baseDir/modules/fastp.nf" +include {ExtractUmis; GroupReads; CallConsensus; } from "$baseDir/modules/fgbio.nf" +include {MultiQC; MultiQC as MultiQC_ALL; } from "$baseDir/modules/multiqc.nf" +include {Faidx; UmiMergeFilt; Sam2bam; Sam2bam as ReSam2bam} from "$baseDir/modules/samtools.nf" +include {DealWithRef} from "$baseDir/modules/pigz.nf" +include {VarDict} from "$baseDir/modules/vardict.nf" +include {AnnotationVEP} from "$baseDir/modules/vep.nf" + + +// Define the workflow +workflow { + Channel.fromFilePairs(params.fastq, checkIfExists:true) + .filter{ v -> v=~ params.filter_fastq} + .set{read_pairs_fastq} + Channel.fromPath(params.ref, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find genome matching ${params.ref}!\n" } + .set{genomeref} -process MultiQC_ALL { - publishDir "${params.outdir}", mode: 
'copy', overwrite: true - input: - tuple val(sample_id), path(file) - val extension + // htslib-based tools (e.g., samtools, bcftools) Require BGZF-compressed FASTA and Use .fai index (samtools faidx) + DealWithRef(genomeref.collect()) + DealWithRef.out.genomerefok.set{genomerefok} - output: - file("${extension}") + Faidx(genomerefok).set{genome_fai} - """ - multiqc ${params.outdir} -o ${extension} - """ -} + // Create the genome index + BwaIndex(genomerefok) -workflow { - Channel.fromFilePairs(params.fastq, checkIfExists:true) - .filter{ v -> v=~ params.filter_fastq} - .set{read_pairs_fastq} + // Picard/GATK can use gzipped compressed fasta but need a .dict file + // Create the genome dictionary + CreateSequenceDictionary(genomerefok) + .set{genome_dict} // 1. Preprocess deduplication ConvertFastqToSam(read_pairs_fastq, ".1.unmapped") ExtractUmis(ConvertFastqToSam.out, params.struct_r1, params.struct_r2, ".1.umi_extracted") ConvertSamToFastq(ExtractUmis.out, ".1.umi_extracted") Fastp(ConvertSamToFastq.out, ".1.umi_extracted.trimmed") - BWAmem(Fastp.out[0], "-t 10", ".1.umi_extracted.aligned") + BWAmem(Fastp.out[0], genomerefok, BwaIndex.out, "", ".1.umi_extracted.aligned") + Sam2bam(BWAmem.out.tuple_sample_sam) - BWAmem.out.join(ExtractUmis.out).set{bams_umis} - MergeBam(bams_umis, ".1.merged") + Sam2bam.out.join(ExtractUmis.out).set{bams_umis} + MergeBam(bams_umis, genomerefok, genome_dict, ".1.merged") UmiMergeFilt(MergeBam.out, ".1.filtered") // 2. Process deduplication GroupReads(UmiMergeFilt.out, ".2.umi_grouped") - CallConsensus(GroupReads.out.nextout, ".2.consensus_unmapped") + CallConsensus(GroupReads.out.nextout, genome_fai, ".2.consensus_unmapped") // 3. 
Post process deduplication RerunConvertSamToFastq(CallConsensus.out, ".3.unmapped") - RerunBWAmem(RerunConvertSamToFastq.out.final_out, "-t 10 -Y", ".3.consensus_mapped") + RerunBWAmem(RerunConvertSamToFastq.out, genomerefok, BwaIndex.out, " -Y", ".3.consensus_mapped") SortConsensus(CallConsensus.out, ".3.unmapped") - RerunSortConsensus(RerunBWAmem.out.final_out, ".3.mapped") + ReSam2bam(RerunBWAmem.out.tuple_sample_sam) + RerunSortConsensus(ReSam2bam.out, ".3.mapped") - RerunSortConsensus.out.final_out.join(SortConsensus.out).set{bams_consensus} - MergeBam2(bams_consensus, ".3.merged") + RerunSortConsensus.out.join(SortConsensus.out).set{bams_consensus} + MergeBam2(bams_consensus, genomerefok, genome_dict, ".3.merged") // 4. Variant Calling & Annotation - VarDict(MergeBam2.out, ".4.vardict") - AnnotationVEP(VarDict.out, ".4.vardict.vep") + VarDict(MergeBam2.out, genomerefok, genome_fai, params.bed,".4.vardict") + AnnotationVEP(VarDict.out, genomerefok, ".4.vardict.vep") // Quality Controls - BedToIntervalList(params.dict, params.bed, params.bed.tokenize('.')[0].tokenize('/')[-1]) - CollectHsMetrics(BWAmem.out, BedToIntervalList.out, ".QC.HsMetrics.1") - RerunCollectHsMetrics(RerunBWAmem.out.final_out, BedToIntervalList.out, ".QC.HsMetrics.3") + BedToIntervalList(genome_dict, params.bed, params.bed.tokenize('.')[0].tokenize('/')[-1]) + CollectHsMetrics(Sam2bam.out, BedToIntervalList.out, genomerefok, genome_fai, ".QC.HsMetrics.1") + RerunCollectHsMetrics(ReSam2bam.out, BedToIntervalList.out, genomerefok, genome_fai, ".QC.HsMetrics.3") BCFtools_stats(VarDict.out, ".QC.bcftools_stats") MultiQC(BCFtools_stats.out, ".QC.multiQC") - - Channel.empty() - .mix( MultiQC.out ) - .map { sample, files -> files } - .collect() - .set { log_files } - MultiQC_ALL(log_files, "all.QC.multiQC") + MultiQC_ALL(BCFtools_stats.out.collect(), "all.QC.multiQC") } diff --git a/README.md b/README.md index 74e72ff..d08d2e0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,22 @@ # MobiCT ctDNA 
Analysis pipeline. Version 1.0.0 -# Introduction +## Table of Contents + +* [Introduction](#introduction) +* [Quick Start](#quick-start) +* [Installation](#installation) + * [Nextflow](#nextflow) + * [Execution environment](#execution-environment) + * [Container platform](#container-platform) + * [Docker](#docker) + * [Singularity](#singularity) + * [Conda](conda) +* [Usage](#usage) +* [Pipeline output](#pipeline-output) +* [Data availability](#data-availability) + +## Introduction Mobict is an analysis pipeline designed for detecting SNV and small InDels in circulating tumor DNA (ctDNA) obtained through non-invasive liquid biopsy, serving diagnostic, prognostic, and therapeutic purposes. The pipeline is built using Nextflow. The associated publication is available here: /doi/.../ @@ -20,13 +35,118 @@ The associated publication is available here: /doi/.../ 4. Download the reference genome 5. Download the datasets needed by VEP (see https://github.com/Ensembl/ensembl-vep) 6. Edit the *.config* file with input and output files/paths -7. Activate your conda environment -8. Run MobiCT on your Dataset +7. Run MobiCT on your Dataset `Nextflow -log /output_directory/my.log run MobiCT.nf -c nextflow.config` -# Pipeline output +## Installation + +The prerequisites to run the pipeline are: + + * [Nextflow](https://www.nextflow.io/) >= 22.04.0 + * [Docker](https://www.docker.com) or [Singularity](https://sylabs.io/singularity/) or [conda](https://juke34.github.io/fix-anaconda-licensing-issues/en/pages/conda-distrib/) + +### Nextflow + + * Via conda + +
+ See here + + ```bash + conda create -n nextflow + conda activate nextflow + conda install bioconda::nextflow + ``` +
+ + * Manually +
+ See here + Nextflow runs on most POSIX systems (Linux, macOS, etc) and can typically be installed by running these commands: + + ```bash + # Make sure Java 11 or later is installed on your computer by using the command: + java -version + + # Install Nextflow by entering this command in your terminal (it creates a file nextflow in the current dir): + curl -s https://get.nextflow.io | bash + + # Add Nextflow binary to your user's PATH: + mv nextflow ~/bin/ + # OR system-wide installation: + # sudo mv nextflow /usr/local/bin + ``` +
+ +### Execution environment + +#### Container platform + +To run the workflow you will need a container platform: docker or singularity. + +##### Docker + +Please follow the instructions at the [Docker website](https://docs.docker.com/desktop/) + +##### Singularity + +Please follow the instructions at the [Singularity website](https://docs.sylabs.io/guides/latest/admin-guide/installation.html) + +#### Conda + +It is recommended to use a container platform. However, if you prefer to use Conda, please follow the instructions available [here](https://juke34.github.io/fix-anaconda-licensing-issues/en/pages/conda-installation/) + +## Usage + +### Profile + +To run the workflow you must select a profile according to the container platform you want to use: +- `singularity`, a profile using Singularity containers to run the softwares +- `docker`, a profile using Docker containers to run the softwares +- `conda`, a profile using conda environments to run the softwares + + +### From Release + +#### Default usage +The command will look like that: + +```bash +nextflow run SimCab-CHU/MobiCT -r v1.0.0 -profile docker +``` + +#### Test + +```bash +nextflow run SimCab-CHU/MobiCT -r v1.0.0 -profile docker,test +``` + +### From repo + +```bash +# clone the workflow repository +git clone https://github.com/SimCab-CHU/MobiCT.git + +# Move in it +cd MobiCT +``` +#### Default usage + +```bash +# Run MobiCT +nextflow run MobiCT.nf -profile docker +``` + +#### Test + +```bash +# Run MobiCT +nextflow run MobiCT.nf -profile docker,test +``` + +## Pipeline output The output files are stored in the directory you specified using the **outdir** parameter in the .config file. The **outdir** contains a folder per sample plus a **multiQC** folder. Each sample folder contains a deduplicated and aligned *.bam* file and its index, an annotated *.vcf* file, 3 metrics files (*.HsMetrics.1.txt*, *.HsMetrics.3.txt* and *QC.bcftools_stats.stats*). 
-# Data availability +## Data availability Raw sequencing data (FASTQ files) of commercial controls used in the study are available @ https://www.ncbi.nlm.nih.gov/sra/PRJNA1209006. diff --git a/config/ressources/ci.config b/config/ressources/ci.config new file mode 100644 index 0000000..968c27c --- /dev/null +++ b/config/ressources/ci.config @@ -0,0 +1,56 @@ +process { + cpus = 1 + time = '1h' + maxForks = 2 + shell = ['/bin/bash', '-euo', 'pipefail'] + stageOutMode = 'rsync' + + withLabel: 'bcftools' { + cpus = 2 + time = '1h' + } + withLabel: 'BwaIndex' { + cpus = 1 + time = '4h' + } + withLabel: 'BWAmem' { + cpus = 2 + time = '4h' + } + withLabel: 'fastp' { + cpus = 2 + time = '4h' + } + withLabel: 'fgbio' { + cpus = 2 + time = '4h' + } + withLabel: 'gatk4' { + cpus = 2 + time = '4h' + } + withLabel: 'pigz' { + cpus = 2 + time = '4h' + } + withLabel: 'multiqc' { + cpus = 2 + time = '2h' + } + withLabel: 'picard' { + cpus = 2 + time = '1h' + } + withLabel: 'samtools' { + cpus = 2 + time = '4h' + } + withLabel: 'vardict' { + cpus = 1 + time = '4h' + } + withLabel: 'vep' { + cpus = 2 + time = '4h' + } +} diff --git a/config/ressources/conda/bcftools.yaml b/config/ressources/conda/bcftools.yaml new file mode 100644 index 0000000..fc17e15 --- /dev/null +++ b/config/ressources/conda/bcftools.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - bcftools=1.9 \ No newline at end of file diff --git a/config/ressources/conda/bwa.yaml b/config/ressources/conda/bwa.yaml new file mode 100644 index 0000000..7f710ae --- /dev/null +++ b/config/ressources/conda/bwa.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - bwa=0.7.19 \ No newline at end of file diff --git a/config/ressources/conda/ensembl-vep.yaml b/config/ressources/conda/ensembl-vep.yaml new file mode 100644 index 0000000..a9ee1c5 --- /dev/null +++ b/config/ressources/conda/ensembl-vep.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - 
conda-forge + - bioconda + +dependencies: + - ensembl-vep=113.0 \ No newline at end of file diff --git a/config/ressources/conda/fastp.yaml b/config/ressources/conda/fastp.yaml new file mode 100644 index 0000000..8672409 --- /dev/null +++ b/config/ressources/conda/fastp.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - fastp=0.24.0 \ No newline at end of file diff --git a/config/ressources/conda/fgbio.yaml b/config/ressources/conda/fgbio.yaml new file mode 100644 index 0000000..348db93 --- /dev/null +++ b/config/ressources/conda/fgbio.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - fgbio=2.5.0 \ No newline at end of file diff --git a/config/ressources/conda/gatk4.yaml b/config/ressources/conda/gatk4.yaml new file mode 100644 index 0000000..079a8dc --- /dev/null +++ b/config/ressources/conda/gatk4.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - gatk4=4.6.1.0 \ No newline at end of file diff --git a/config/ressources/conda/multiqc.yaml b/config/ressources/conda/multiqc.yaml new file mode 100644 index 0000000..f630102 --- /dev/null +++ b/config/ressources/conda/multiqc.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - multiqc=1.28 \ No newline at end of file diff --git a/config/ressources/conda/picard.yaml b/config/ressources/conda/picard.yaml new file mode 100644 index 0000000..61da0f6 --- /dev/null +++ b/config/ressources/conda/picard.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - picard=3.3.0 \ No newline at end of file diff --git a/config/ressources/conda/pigz.yaml b/config/ressources/conda/pigz.yaml new file mode 100644 index 0000000..e18ebd6 --- /dev/null +++ b/config/ressources/conda/pigz.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - pigz=2.8 \ No newline at end of file diff --git 
a/config/ressources/conda/samtools.yaml b/config/ressources/conda/samtools.yaml new file mode 100644 index 0000000..9beff77 --- /dev/null +++ b/config/ressources/conda/samtools.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - samtools=1.6 \ No newline at end of file diff --git a/config/ressources/conda/vardict.yaml b/config/ressources/conda/vardict.yaml new file mode 100644 index 0000000..b6ab698 --- /dev/null +++ b/config/ressources/conda/vardict.yaml @@ -0,0 +1,7 @@ +name: gatk4 +channels: + - conda-forge + - bioconda + +dependencies: + - vardict-java=1.8.3 \ No newline at end of file diff --git a/config/ressources/hpc.config b/config/ressources/hpc.config new file mode 100644 index 0000000..1be18d1 --- /dev/null +++ b/config/ressources/hpc.config @@ -0,0 +1,56 @@ +process { + cpus = 1 + time = '1h' + maxForks = 20 + shell = ['/bin/bash', '-euo', 'pipefail'] + stageOutMode = 'rsync' + + withLabel: 'bcftools' { + cpus = 4 + time = '1h' + } + withName: 'BwaIndex' { + cpus = 1 + time = '4h' + } + withName: 'BWAmem' { + cpus = 16 + time = '4h' + } + withLabel: 'fastp' { + cpus = 4 + time = '4h' + } + withLabel: 'fgbio' { + cpus = 4 + time = '4h' + } + withLabel: 'gatk4' { + cpus = 4 + time = '4h' + } + withLabel: 'pigz' { + cpus = 8 + time = '4h' + } + withName: 'multiqc' { + cpus = 4 + time = '2h' + } + withLabel: 'picard' { + cpus = 4 + time = '1h' + } + withLabel: 'samtools' { + cpus = 8 + time = '4h' + } + withLabel: 'vardict' { + cpus = 1 + time = '4h' + } + withLabel: 'vep' { + cpus = 4 + time = '4h' + } +} diff --git a/config/ressources/local.config b/config/ressources/local.config new file mode 100644 index 0000000..327eeca --- /dev/null +++ b/config/ressources/local.config @@ -0,0 +1,56 @@ +process { + cpus = 1 + time = '1h' + maxForks = 4 + shell = ['/bin/bash', '-euo', 'pipefail'] + stageOutMode = 'rsync' + + withLabel: 'bcftools' { + cpus = 4 + time = '1h' + } + withName: 'BwaIndex' { + cpus = 1 + time = '4h' 
+ } + withName: 'BWAmem' { + cpus = 4 + time = '4h' + } + withLabel: 'fastp' { + cpus = 4 + time = '4h' + } + withLabel: 'fgbio' { + cpus = 4 + time = '4h' + } + withLabel: 'gatk4' { + cpus = 4 + time = '4h' + } + withLabel: 'pigz' { + cpus = 4 + time = '4h' + } + withLabel: 'multiqc' { + cpus = 4 + time = '2h' + } + withLabel: 'picard' { + cpus = 4 + time = '1h' + } + withLabel: 'samtools' { + cpus = 4 + time = '4h' + } + withLabel: 'vardict' { + cpus = 1 + time = '4h' + } + withLabel: 'vep' { + cpus = 4 + time = '4h' + } +} diff --git a/config/softwares.config b/config/softwares.config new file mode 100644 index 0000000..05bbf04 --- /dev/null +++ b/config/softwares.config @@ -0,0 +1,57 @@ +process { + withLabel: 'bcftools' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/bcftools.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/bcftools:1.9--h4da6232_0' : + docker.enabled ? 'quay.io/biocontainers/bcftools:1.9--h4da6232_0' : '' } + } + withLabel: 'bwamem' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/bwa.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/bwa:0.7.19--h577a1d6_0' : + docker.enabled ? 'quay.io/biocontainers/bwa:0.7.19--h577a1d6_0' : '' } + } + withLabel: 'fastp' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/fastp.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/fastp:0.24.0--heae3180_1' : + docker.enabled ? 'quay.io/biocontainers/fastp:0.24.0--heae3180_1' : '' } + } + withLabel: 'fgbio' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/fgbio.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/fgbio:2.5.0--hdfd78af_1' : + docker.enabled ? 
'quay.io/biocontainers/fgbio:2.5.0--hdfd78af_1' : '' } + } + withLabel: 'gatk4' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/gatk4.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/gatk4:4.6.1.0--py310hdfd78af_0' : + docker.enabled ? 'quay.io/biocontainers/gatk4:4.6.1.0--py310hdfd78af_0' : '' } + } + withLabel: 'multiqc' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/multiqc.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/multiqc:1.28--pyhdfd78af_0' : + docker.enabled ? 'quay.io/biocontainers/multiqc:1.28--pyhdfd78af_0' : '' } + } + withLabel: 'picard' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/picard.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/picard:3.3.0--hdfd78af_0' : + docker.enabled ? 'quay.io/biocontainers/picard:3.3.0--hdfd78af_0' : '' } + } + withLabel: 'pigz' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/pigz.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/pigz:2.8' : + docker.enabled ? 'quay.io/biocontainers/pigz:2.8' : '' } + } + withLabel: 'samtools' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/samtools.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/samtools:1.6--h5fe306e_11' : + docker.enabled ? 'quay.io/biocontainers/samtools:1.6--h5fe306e_11' : '' } + } + withLabel: 'vardict' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/vardict.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/vardict-java:1.8.3--hdfd78af_0' : + docker.enabled ? 
'quay.io/biocontainers/vardict-java:1.8.3--hdfd78af_0' : '' } + } + withLabel: 'vep' { + conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/ensembl-vep.yaml" } + container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/ensembl-vep:113.0--pl5321h2a3209d_0' : + docker.enabled ? 'quay.io/biocontainers/ensembl-vep:113.0--pl5321h2a3209d_0' : '' } + } +} \ No newline at end of file diff --git a/modules/bcftools.nf b/modules/bcftools.nf new file mode 100644 index 0000000..297d942 --- /dev/null +++ b/modules/bcftools.nf @@ -0,0 +1,17 @@ +process BCFtools_stats { + tag "${sample_id}" + label 'bcftools' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), path(file) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.stats") + + """ + bcftools stats --threads ${task.cpus} ${file} > ${sample_id}${extension}.stats + """ +} diff --git a/modules/bwamem.nf b/modules/bwamem.nf new file mode 100644 index 0000000..ccf6d8b --- /dev/null +++ b/modules/bwamem.nf @@ -0,0 +1,44 @@ +process BwaIndex { + label 'bwamem' + tag "$genome_fasta" + + input: + path(genome_fasta) + + output: + path("*") + + script: + """ + bwa index $genome_fasta -p ${genome_fasta.baseName} + """ +} + +// Alignement before deduplication: Align quality and adapter trimmed reads to +// the reference genome +process BWAmem { + tag "$sample_id" + label 'bwamem' + + input: + tuple val(sample_id), path(fastq) + path genomeref + path bwa_index_files + val opt_bwa + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.sam"), emit: tuple_sample_sam + path "*.log", emit: bwamem_summary + + """ + bwa mem \ + -t ${task.cpus} \ + ${opt_bwa} \ + -M \ + ${genomeref.baseName} \ + ${fastq[0]} \ + ${fastq[1]} \ + > ${sample_id}${extension}.sam 2> ${sample_id}${extension}.log + """ +} \ No newline at end of file diff --git a/modules/fastp.nf 
b/modules/fastp.nf new file mode 100644 index 0000000..c0ade77 --- /dev/null +++ b/modules/fastp.nf @@ -0,0 +1,30 @@ +// Adapter and quality trimming +process Fastp { + tag "$sample_id" + label 'fastp' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.{json,html}' + + input: + tuple val(sample_id), path(fastq) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq") + file "${sample_id}.QC.fastp.json" + file "${sample_id}.QC.fastp.html" + + script: + """ + fastp \ + --thread ${task.cpus} \ + -i ${fastq[0]} \ + -o ${sample_id}${extension}.R1.fq \ + -I ${fastq[1]} \ + -O ${sample_id}${extension}.R2.fq \ + -g -W 5 -q 20 -u 40 -x -3 -l 75 -c \ + -j ${sample_id}.QC.fastp.json \ + -h ${sample_id}.QC.fastp.html \ + -w 12 + """ +} \ No newline at end of file diff --git a/modules/fgbio.nf b/modules/fgbio.nf new file mode 100644 index 0000000..227a1b7 --- /dev/null +++ b/modules/fgbio.nf @@ -0,0 +1,81 @@ +// Extraction of UMIs from the insert reads +process ExtractUmis { + tag "$sample_id" + label 'fgbio' + + input: + tuple val(sample_id), path(bam_file) + val struct_r1 + val struct_r2 + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam") + + script: + """ + fgbio ExtractUmisFromBam \ + -i ${bam_file} \ + -o ${sample_id}${extension}.bam \ + -r ${struct_r1} ${struct_r2} \ + -t RX \ + -a true + """ +} + +// Identify and group reads originating from the same source molecule +// The user can control how many errors/mismatches are allowed in the UMI sequence when assigning source molecules (--edits=n). 
+process GroupReads { + tag "$sample_id" + label 'fgbio' + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.txt' + + input: + tuple val(sample_id), path(bam) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam"), emit: nextout + file "${sample_id}.QC.family_size_counts.txt" + + script: + """ + fgbio GroupReadsByUmi \ + -i ${bam} \ + -o ${sample_id}${extension}.bam \ + --strategy=adjacency \ + --edits=1 \ + -t RX \ + -f ${sample_id}.QC.family_size_counts.txt + """ +} + +// Generate consensus reads +// Calculate the consensus sequence, Reads that occur as singletons are +// discarded by default but this can be changed by setting the –min-reads flag +// to 1, in so doing the single read will be considered the consensus. +process CallConsensus { + tag "$sample_id" + label 'fgbio' + + input: + tuple val(sample_id), path(bam) + path genome_fai + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam") + + script: + """ + fgbio CallMolecularConsensusReads \ + -i ${bam} \ + -o ${sample_id}${extension}.bam \ + --error-rate-post-umi 40 \ + --error-rate-pre-umi 45 \ + --output-per-base-tags false \ + --min-reads 2 \ + --max-reads 50 \ + --read-name-prefix='consensus' + """ +} \ No newline at end of file diff --git a/modules/gatk.nf b/modules/gatk.nf new file mode 100644 index 0000000..10f00cd --- /dev/null +++ b/modules/gatk.nf @@ -0,0 +1,136 @@ +// Convert the demultiplexed, raw sequencing FASTQ files to BAM +process ConvertFastqToSam { + tag "$sample_id" + label 'gatk4' + + input: + tuple val('sample_id'), path(fastq) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam") + + """ + gatk FastqToSam \ + --FASTQ ${fastq[0]} \ + --FASTQ2 ${fastq[1]} \ + --OUTPUT ${sample_id}${extension}.bam \ + --SAMPLE_NAME ${sample_id} \ + """ +} + +// Convert the BAM file with UMI extracted reads to a FASTQ file +process ConvertSamToFastq { + tag 
"$sample_id" + label 'gatk4' + + input: + tuple val(sample_id), path(bam_file) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq") + + """ + gatk SamToFastq \ + -I ${bam_file} \ + -F ${sample_id}${extension}.R1.fq \ + -F2 ${sample_id}${extension}.R2.fq \ + --CLIPPING_ATTRIBUTE XT \ + --CLIPPING_ACTION 2 \ + --MAX_RECORDS_IN_RAM 50000000 + """ +} + +// Merge the two BAM files containing: +// 1: the UMI information: output of ExtractUmis process +// 2: the alignment coordinate information: output of bwaMEM process +process MergeBam { + tag "$sample_id" + label 'gatk4' + + input: + tuple val(sample_id), path(bam_aligned), path(bam_unmapped) + path genomeref + path genome_dict + val extension + + + output: + tuple val(sample_id), file("${sample_id}${extension}.ba[i,m]") + + """ + gatk MergeBamAlignment \ + --ATTRIBUTES_TO_RETAIN X0 \ + --ATTRIBUTES_TO_REMOVE NM \ + --ATTRIBUTES_TO_REMOVE MD \ + --ALIGNED_BAM ${bam_aligned} \ + --UNMAPPED_BAM ${bam_unmapped} \ + --OUTPUT ${sample_id}${extension}.bam \ + --REFERENCE_SEQUENCE ${genomeref} \ + --SORT_ORDER 'queryname' \ + --ALIGNED_READS_ONLY true \ + --MAX_INSERTIONS_OR_DELETIONS -1 \ + --PRIMARY_ALIGNMENT_STRATEGY MostDistant \ + --ALIGNER_PROPER_PAIR_FLAGS true \ + --CREATE_INDEX true \ + --CLIP_OVERLAPPING_READS false + """ +} + +// Sort the consensus_mapped.bam with the consensus_unmapped.bam to prepare them as input for the next step +process SortConsensus { + tag "$sample_id" + label 'gatk4' + + input: + tuple val(sample_id), path(bam) + val extension + + output: + tuple val(sample_id), path("${sample_id}${extension}.sort.bam") + + script: + + """ + gatk SortSam \ + -I ${bam} \ + --OUTPUT ${sample_id}${extension}.sort.bam \ + --SORT_ORDER queryname + """ +} + +// Finally, merge the consensus_mapped.bam with the consensus_unmapped.bam to +// retain the UMI group information. 
+process MergeBam2 { + tag "$sample_id" + label 'gatk4' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), path(bam_aligned), path(bam_unmapped) + path genomeref + path genome_dict + val extension + + output: + tuple val(sample_id), path("${sample_id}${extension}.ba[m,i]") + + """ + gatk MergeBamAlignment \ + --ATTRIBUTES_TO_RETAIN X0 \ + --ATTRIBUTES_TO_RETAIN RX \ + --ALIGNED_BAM ${bam_aligned} \ + --UNMAPPED_BAM ${bam_unmapped} \ + --OUTPUT ${sample_id}${extension}.bam \ + --REFERENCE_SEQUENCE ${genomeref} \ + --SORT_ORDER coordinate \ + --ADD_MATE_CIGAR true \ + --MAX_INSERTIONS_OR_DELETIONS -1 \ + --PRIMARY_ALIGNMENT_STRATEGY MostDistant \ + --ALIGNER_PROPER_PAIR_FLAGS true \ + --CREATE_INDEX true \ + --CLIP_OVERLAPPING_READS false + """ +} \ No newline at end of file diff --git a/modules/multiqc.nf b/modules/multiqc.nf new file mode 100644 index 0000000..d1627d1 --- /dev/null +++ b/modules/multiqc.nf @@ -0,0 +1,19 @@ +// Generate a multi-quality control report from collected metrics data (process +// collectmetrics2 output). +process MultiQC { + tag "${sample_id}" + label 'multiqc' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), path(file) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}") + + """ + multiqc -p . 
-o ${sample_id}${extension} + """ +} \ No newline at end of file diff --git a/modules/picard.nf b/modules/picard.nf new file mode 100644 index 0000000..e638538 --- /dev/null +++ b/modules/picard.nf @@ -0,0 +1,59 @@ +process CreateSequenceDictionary { + label 'picard' + + input: + path genome + + output: + file "*.dict" + + """ + picard CreateSequenceDictionary R=${genome} O=${genome}.dict + """ +} + +process BedToIntervalList { + label 'picard' + + input: + path dict + path bed + val extension + + output: + file "${extension}.interval_list" + + """ + picard BedToIntervalList \ + --SEQUENCE_DICTIONARY ${dict} \ + --INPUT ${bed} \ + --OUTPUT ${extension}.interval_list + """ +} + +// Extraction of read quality metrics before deduplication +process CollectHsMetrics { + tag "$sample_id" + label 'picard' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), file(bam) + file bed + path genomeref + path genome_fai + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.txt") + + """ + picard CollectHsMetrics \ + --REFERENCE_SEQUENCE ${genomeref} \ + --BAIT_INTERVALS ${bed} \ + --TARGET_INTERVALS ${bed} \ + --INPUT ${bam} \ + --OUTPUT ${sample_id}${extension}.txt + """ +} diff --git a/modules/pigz.nf b/modules/pigz.nf new file mode 100644 index 0000000..25bb466 --- /dev/null +++ b/modules/pigz.nf @@ -0,0 +1,29 @@ +// +process DealWithRef { + tag "$genome_fasta" + label 'pigz' + + input: + path(genome_fasta) + + output: + path "*.fa", emit: genomerefok + + script: + genomeReady = genome_fasta.baseName.replaceAll(/\.(gz)$/, '') // remove .gz + genomeReady = genomeReady.replaceAll(/\.(fasta|fa)$/, '') // remove .fasta or .fa + genomeFa = genomeReady + ".fa" + """ + + + # DEALING WITH GZIPPED FILES to uncompress if needed + extension=\$(echo ${genome_fasta} | awk -F . 
'{print \$NF}') + + if [[ \${extension} == "gz" ]];then + pigz -dck -p ${task.cpus} ${genome_fasta} > ${genomeFa} + else + # copy fa + ln -s ${genome_fasta} ${genomeFa} + fi + """ +} \ No newline at end of file diff --git a/modules/samtools.nf b/modules/samtools.nf new file mode 100644 index 0000000..6954802 --- /dev/null +++ b/modules/samtools.nf @@ -0,0 +1,53 @@ +// +process Faidx { + tag "$genome_fasta" + label 'samtools' + + input: + path(genome_fasta) + + output: + path("*") + + """ + samtools faidx $genome_fasta + """ +} + +// +process UmiMergeFilt { + tag "$sample_id" + label 'samtools' + + input: + tuple val(sample_id), path(bam) + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.bam") + + """ + samtools view \ + -f2 \ + -bh ${bam} \ + > ${sample_id}${extension}.bam + """ +} + +process Sam2bam { + label 'samtools' + tag "$sample" + + input: + tuple val(sample), path(sam) + + output: + tuple val(sample), path ("*.bam"), emit: tuple_sample_bam + + script: + + """ + samtools view -@ ${task.cpus} ${sam} -bh -o ${sam.baseName}.bam + """ + +} \ No newline at end of file diff --git a/modules/vardict.nf b/modules/vardict.nf new file mode 100644 index 0000000..f6e6e46 --- /dev/null +++ b/modules/vardict.nf @@ -0,0 +1,29 @@ +// Variant calling step using vardict +process VarDict { + tag "$sample_id" + label 'vardict' + + input: + tuple val(sample_id), path(bami) + path genomeref + path genomeref_fai + path bed + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.vcf") + + """ + vardict-java \ + -G ${genomeref} \ + -f 0.0005 \ + -N ${sample_id} \ + -b ${bami[1]} \ + -c 1 \ + -S 2 \ + -E 3 \ + -g 4 ${bed} \ + | teststrandbias.R \ + | var2vcf_valid.pl > ${sample_id}${extension}.vcf + """ +} diff --git a/modules/vep.nf b/modules/vep.nf new file mode 100644 index 0000000..c95075e --- /dev/null +++ b/modules/vep.nf @@ -0,0 +1,33 @@ +// Annotation step using VEP +process AnnotationVEP { + tag "$sample_id" + label 
'vep' + + publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true + + input: + tuple val(sample_id), path(vcf) + path genomeref + val extension + + output: + tuple val(sample_id), file("${sample_id}${extension}.vcf") + + script: + def config = params.cache?.trim() ? "--offline --cache --dir_cache ${params.cache} --max_af " : "--database" + """ + vep \ + ${config} \ + --fork ${task.cpus} \ + -i ${vcf} \ + -o ${sample_id}${extension}.vcf \ + --force_overwrite \ + --vcf \ + --numbers \ + --refseq \ + --symbol \ + --hgvs \ + --canonical \ + --fasta ${genomeref} + """ +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 9293def..c539348 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,21 +1,86 @@ -docker.enabled = true -conda.enabled = true +manifest { + name = 'MobiCT' + author = '' + homePage = 'https://github.com/SimCab-CHU/MobiCT' + description = 'ctDNA Analysis pipeline' + mainScript = 'MobiCT.nf' + nextflowVersion = '>=22.04.0' + version = '1.0.0' +} + +// meta parameters dag.overwrite = true -process.executor = 'slurm' -workDir = "/path/to/workDir" params { - struct_r1 = "5M2S+T" - struct_r2 = "5M2S+T" - ref = "/path/to/referenceGenome.fasta" - dict = "/path/to/referenceGenome.dict" - fastq = "/path/to/fastq/files/*_{R1,R2}_001.fastq.gz" - filter_fastq = /^((?!Undetermined).)*$/ - outdir = "/output/path" - bed = "/path/to/bedFile" - teststrandbias = "/path/to/teststrandbias.R/used/by/vardict" - var2vcf = "/path/to/var2vcf_valid.pl/used/by/vardict" - cache = "/path/to/cacheVEP" - fasta = "/path/to/fastaVEP" - tmp_dir = "/path/to/a/tmp/dir/" (optional) + // main params + ref = "/path/to/referenceGenome.fasta" // If you’re using a VEP cache (e.g. Ensembl’s), use the matching reference FASTA from Ensembl + fastq = "/path/to/fastq/files/*_{R1,R2}_001.fastq.gz" + outdir = "/path/to/outdir" + bed = "/path/to/bedFile" + cache = "" // Path to the VEP cache. 
If you’re using a VEP cache, use the matching reference FASTA from Ensembl: + + // secondary params + struct_r1 = "5M2S+T" + struct_r2 = "5M2S+T" + filter_fastq = /^((?!Undetermined).)*$/ + pipeline_report="pipeline_report" +} + +profiles { + slurm { + process.executor = 'slurm' + } + + docker { + docker.enabled = true + docker.runOptions='-u "$( id -u ):$( id -g )"' + includeConfig "$baseDir/config/softwares.config" + } + + singularity { + singularity.enabled = true + includeConfig "$baseDir/config/softwares.config" + } + conda { + conda.enabled = true + includeConfig "$baseDir/config/softwares.config" + } + // Use maximum 4 cpu per process + local { + includeConfig "$baseDir/config/ressources/local.config" + } + // Use maximum 2 cpu per process + ci { + includeConfig "$baseDir/config/ressources/ci.config" + } + test{ + params { + bed = "$baseDir/Example_data/panel_lung.bed" + fastq = "$baseDir/Example_data/*_{R1,R2}_001.fastq.gz" + outdir = "result_test" + ref = "https://ftp.ensembl.org/pub/release-113/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.7.fa.gz" + } + } +} + +resume = true + +timeline { + enabled = true + file = "${params.pipeline_report}/execution_timeline_${new Date().format('yyyyMMddHHmmss')}.html" +} + +report { + enabled = true + file = "${params.pipeline_report}/execution_report_${new Date().format('yyyyMMddHHmmss')}.html" +} + +trace { + enabled = true + file = "${params.pipeline_report}/execution_trace_${new Date().format('yyyyMMddHHmmss')}.txt" +} + +dag { + enabled = true + file = "${params.pipeline_report}/pipeline_dag.svg" }