diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..fff23c8
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,62 @@
+# This is a basic workflow to help you get started with Actions
+
+name: CI
+
+# Controls when the workflow will run.
+# Triggers the workflow on push, pull request, or manual (workflow_dispatch) events
+on: [push, pull_request, workflow_dispatch]
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+ cancel-in-progress: true
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+
+ # This workflow contains a single job called "test_MobiCT"
+ test_MobiCT:
+ # avoid running the workflow twice (push + PR) for branches of this repository
+ if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name
+
+ # The type of runner that the job will run on
+ runs-on: ubuntu-24.04
+ services:
+ docker:
+ image: docker:dind
+ options: --privileged --shm-size=2g
+ volumes:
+ - /var/run/docker.sock:/var/run/docker.sock:ro
+
+ # Steps represent a sequence of tasks that will be executed as part of the job
+ steps:
+ # Install/cache OpenJDK
+ - name: Cache OpenJDK
+ id: cache-openjdk
+ uses: actions/cache@v3
+ with:
+ path: /usr/lib/jvm/java-11-openjdk-amd64
+ key: ${{ runner.os }}-openjdk-11
+ restore-keys: |
+ ${{ runner.os }}-openjdk-11
+ - name: Install OpenJDK
+ if: ${{ steps.cache-openjdk.outputs.cache-hit != 'true' }}
+ run: sudo apt-get install -y openjdk-11-jdk
+ # Install/cache nextflow
+ - name: Cache nextflow
+ id: cache-nextflow
+ uses: actions/cache@v3
+ with:
+ path: /usr/local/bin/nextflow
+ key: ${{ runner.os }}-nextflow
+ restore-keys: |
+ ${{ runner.os }}-nextflow
+ - name: Install Nextflow
+ if: ${{ steps.cache-nextflow.outputs.cache-hit != 'true' }}
+ run: curl -s https://get.nextflow.io | bash && mv nextflow /usr/local/bin && chmod +x /usr/local/bin/nextflow
+
+ # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+ - uses: actions/checkout@v3
+ # Run tests sequentially
+ - name: test
+ run: nextflow run -ansi-log -profile docker,test,ci MobiCT.nf
+
diff --git a/Example_data/panel_lung.bed b/Example_data/panel_lung.bed
index 621c4a5..fe0c88b 100644
--- a/Example_data/panel_lung.bed
+++ b/Example_data/panel_lung.bed
@@ -1,8 +1,5 @@
-chr12 25245268 25245388 NM_033360 1
-chr15 90088586 90088747 NM_002168 1
-chr2 208248368 208248660 NM_005896 1
-chr7 55173921 55174041 NM_005228 1
-chr7 55174711 55174831 NM_005228 1
-chr7 55181292 55181478 NM_005228 1
-chr7 55191718 55191874 NM_005228 1
-chr7 140753274 140753394 NM_004333 1
+7 55173921 55174041 NM_005228 1
+7 55174711 55174831 NM_005228 1
+7 55181292 55181478 NM_005228 1
+7 55191718 55191874 NM_005228 1
+7 140753274 140753394 NM_004333 1
diff --git a/MobiCT.nf b/MobiCT.nf
index 338c373..0106c7b 100644
--- a/MobiCT.nf
+++ b/MobiCT.nf
@@ -23,523 +23,87 @@ email = 's-cabelloaguilar@chu-montpellier.fr'
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/
-// PROCESSES
-
-// Convert the demultiplexed, raw sequencing FASTQ files to BAM
-process ConvertFastqToSam {
- tag "$sample_id"
-
- input:
- tuple val('sample_id'), path(fastq)
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.bam")
-
- """
- gatk FastqToSam \
- --FASTQ ${fastq[0]} \
- --FASTQ2 ${fastq[1]} \
- --OUTPUT ${sample_id}${extension}.bam \
- --SAMPLE_NAME ${sample_id} \
- --TMP_DIR ${params.tmp_dir}
- """
-}
-
-// Extraction of UMIs from the insert reads
-process ExtractUmis {
- tag "$sample_id"
-
- input:
- tuple val(sample_id), path(bam_file)
- val struct_r1
- val struct_r2
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.bam")
-
- """
- fgbio ExtractUmisFromBam \
- -i ${bam_file} \
- -o ${sample_id}${extension}.bam \
- -r ${struct_r1} ${struct_r2} \
- -t RX \
- -a true
- """
-}
-
-// Convert the BAM file with UMI extracted reads to a FASTQ file
-process ConvertSamToFastq {
- tag "$sample_id"
-
- input:
- tuple val(sample_id), path(bam_file)
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq")
-
- """
- gatk SamToFastq \
- -I ${bam_file} \
- -F ${sample_id}${extension}.R1.fq \
- -F2 ${sample_id}${extension}.R2.fq \
- --CLIPPING_ATTRIBUTE XT \
- --CLIPPING_ACTION 2 \
- --MAX_RECORDS_IN_RAM 50000000
- """
-}
-
-// Adapter and quality trimming
-process Fastp {
- tag "$sample_id"
-
- publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.{json,html}'
-
- input:
- tuple val(sample_id), path(fastq)
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq")
- file "${sample_id}.QC.fastp.json"
- file "${sample_id}.QC.fastp.html"
-
- script:
- """
- fastp \
- -i ${fastq[0]} \
- -o ${sample_id}${extension}.R1.fq \
- -I ${fastq[1]} \
- -O ${sample_id}${extension}.R2.fq \
- -g -W 5 -q 20 -u 40 -x -3 -l 75 -c \
- -j ${sample_id}.QC.fastp.json \
- -h ${sample_id}.QC.fastp.html \
- -w 12
- """
-}
-
-// Alignement before deduplication: Align quality and adapter trimmed reads to
-// the reference genome
-process BWAmem {
- tag "$sample_id"
- clusterOptions '-n 10'
-
- input:
- tuple val(sample_id), path(fastq)
- val opt_bwa
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.bam")
-
- """
- bwa mem \
- ${opt_bwa} \
- -M ${params.ref} \
- ${fastq[0]} \
- ${fastq[1]} \
- | \
- samtools view -bh -o ${sample_id}${extension}.bam
- """
-}
-
-// Merge the two BAM files containing:
-// 1: the UMI information: output of ExtractUmis process
-// 2: the alignment coordinate information: output of bwaMEM process
-process MergeBam {
- tag "$sample_id"
-
- input:
- tuple val(sample_id), path(bam_aligned), path(bam_unmapped)
- val extension
-
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.ba[i,m]")
-
- """
- gatk MergeBamAlignment \
- --ATTRIBUTES_TO_RETAIN X0 \
- --ATTRIBUTES_TO_REMOVE NM \
- --ATTRIBUTES_TO_REMOVE MD \
- --ALIGNED_BAM ${bam_aligned} \
- --UNMAPPED_BAM ${bam_unmapped} \
- --OUTPUT ${sample_id}${extension}.bam \
- --REFERENCE_SEQUENCE ${params.ref} \
- --SORT_ORDER 'queryname' \
- --ALIGNED_READS_ONLY true \
- --MAX_INSERTIONS_OR_DELETIONS -1 \
- --PRIMARY_ALIGNMENT_STRATEGY MostDistant \
- --ALIGNER_PROPER_PAIR_FLAGS true \
- --CREATE_INDEX true \
- --CLIP_OVERLAPPING_READS false
- """
-}
-
-//
-process UmiMergeFilt {
- tag "$sample_id"
-
- input:
- tuple val(sample_id), path(bam)
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.bam")
-
- """
- samtools view \
- -f2 \
- -bh ${bam} \
- > ${sample_id}${extension}.bam
- """
-}
-
-// Identify and group reads originating from the same source molecule
-// The user can control how many errors/mismatches are allowed in the UMI sequence when assigning source molecules (--edits=n).
-process GroupReads {
- tag "$sample_id"
-
- publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.txt'
-
- input:
- tuple val(sample_id), path(bam)
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.bam"), emit: nextout
- file "${sample_id}.QC.family_size_counts.txt"
-
- """
- fgbio GroupReadsByUmi \
- -i ${bam} \
- -o ${sample_id}${extension}.bam \
- --strategy=adjacency \
- --edits=1 \
- -t RX \
- -f ${sample_id}.QC.family_size_counts.txt
- """
-}
-
-// Generate consensus reads
-// Calculate the consensus sequence, Reads that occur as singletons are
-// discarded by default but this can be changed by setting the –min-reads flag
-// to 1, in so doing the single read will be considered the consensus.
-process CallConsensus {
- tag "$sample_id"
-
- input:
- tuple val(sample_id), path(bam)
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.bam")
-
- """
- fgbio CallMolecularConsensusReads \
- -i ${bam} \
- -o ${sample_id}${extension}.bam \
- --error-rate-post-umi 40 \
- --error-rate-pre-umi 45 \
- --output-per-base-tags false \
- --min-reads 2 \
- --max-reads 50 \
- --read-name-prefix='consensus'
- """
-}
-
-workflow RerunConvertSamToFastq {
- take:
- tuple_input
- extension
- main:
- ConvertSamToFastq(tuple_input, extension)
- emit:
- final_out = ConvertSamToFastq.out
-}
-
-workflow RerunBWAmem {
- take:
- tuple_input
- opt_bwa
- extension
- main:
- BWAmem(tuple_input, opt_bwa, extension)
- emit:
- final_out = BWAmem.out
-}
-
-// Sort the consensus_mapped.bam with the consensus_unmapped.bam to prepare them as input for the next step
-process SortConsensus {
- tag "$sample_id"
-
- input:
- tuple val(sample_id), path(bam)
- val extension
-
- output:
- tuple val(sample_id), path("${sample_id}${extension}.sort.bam")
-
- script:
-
- """
- gatk SortSam \
- -I ${bam} \
- --OUTPUT ${sample_id}${extension}.sort.bam \
- --SORT_ORDER queryname
- """
-}
-
-workflow RerunSortConsensus {
- take:
- tuple_input
- extension
- main:
- SortConsensus(tuple_input, extension)
- emit:
- final_out = SortConsensus.out
-}
-
-// Finally, merge the consensus_mapped.bam with the consensus_unmapped.bam to
-// retain the UMI group information.
-process MergeBam2 {
- tag "$sample_id"
-
- publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true
-
- input:
- tuple val(sample_id), path(bam_aligned), path(bam_unmapped)
- val extension
-
- output:
- tuple val(sample_id), path("${sample_id}${extension}.ba[m,i]")
-
- """
- gatk MergeBamAlignment \
- --ATTRIBUTES_TO_RETAIN X0 \
- --ATTRIBUTES_TO_RETAIN RX \
- --ALIGNED_BAM ${bam_aligned} \
- --UNMAPPED_BAM ${bam_unmapped} \
- --OUTPUT ${sample_id}${extension}.bam \
- --REFERENCE_SEQUENCE ${params.ref} \
- --SORT_ORDER coordinate \
- --ADD_MATE_CIGAR true \
- --MAX_INSERTIONS_OR_DELETIONS -1 \
- --PRIMARY_ALIGNMENT_STRATEGY MostDistant \
- --ALIGNER_PROPER_PAIR_FLAGS true \
- --CREATE_INDEX true \
- --CLIP_OVERLAPPING_READS false
- """
-}
-
-// Variant calling step using vardict
-process VarDict {
- tag "$sample_id"
-
- input:
- tuple val(sample_id), path(bami)
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.vcf")
-
- """
- vardict \
- -G ${params.ref} \
- -f 0.0005 \
- -N ${sample_id} \
- -b ${bami[1]} \
- -c 1 \
- -S 2 \
- -E 3 \
- -g 4 ${params.bed} \
- | $params.teststrandbias \
- | $params.var2vcf > ${sample_id}${extension}.vcf
- """
-}
-
-// Annotation step using VEP
-process AnnotationVEP {
- tag "$sample_id"
-
- publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true
-
- input:
- tuple val(sample_id), path(vcf)
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.vcf")
-
- """
- vep \
- -i ${vcf} \
- -o ${sample_id}${extension}.vcf \
- --cache \
- --dir_cache ${params.cache} \
- --offline \
- --force_overwrite \
- --vcf \
- --numbers \
- --refseq \
- --symbol \
- --hgvs \
- --canonical \
- --max_af \
- --fasta ${params.fasta}
- """
-}
-
-process BedToIntervalList {
-
- input:
- path dict
- path bed
- val extension
-
- output:
- file "${extension}.interval_list"
-
- """
- picard BedToIntervalList \
- --SEQUENCE_DICTIONARY ${dict} \
- --INPUT ${bed} \
- --OUTPUT ${extension}.interval_list
- """
-}
-
-// Extraction of read quality metrics before deduplication
-process CollectHsMetrics {
- tag "$sample_id"
-
- publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true
-
- input:
- tuple val(sample_id), file(bam)
- file bed
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.txt")
-
- """
- picard CollectHsMetrics \
- --REFERENCE_SEQUENCE ${params.ref} \
- --BAIT_INTERVALS ${bed} \
- --TARGET_INTERVALS ${bed} \
- --INPUT ${bam} \
- --OUTPUT ${sample_id}${extension}.txt
- """
-}
-
-workflow RerunCollectHsMetrics {
- take:
- tuple_input
- bed
- extension
- main:
- CollectHsMetrics(tuple_input, bed, extension)
- emit:
- final_out = CollectHsMetrics.out
-}
-
-process BCFtools_stats {
- tag "${sample_id}"
-
- publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true
-
- input:
- tuple val(sample_id), path(file)
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}.stats")
-
- """
- bcftools stats ${file} > ${sample_id}${extension}.stats
- """
-}
-
-// Generate a multi-quality control report from collected metrics data (process
-// collectmetrics2 output).
-process MultiQC {
- tag "${sample_id}"
-
- publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true
-
- input:
- tuple val(sample_id), path(file)
- val extension
-
- output:
- tuple val(sample_id), file("${sample_id}${extension}")
-
- """
- multiqc ${params.outdir}/${sample_id} -o ${sample_id}${extension}
- """
-}
+// Check profiles
+if (
+ workflow.profile.contains('singularity') ||
+ workflow.profile.contains('docker') ||
+ workflow.profile.contains('conda')
+ ) { "executer selected" }
+else { exit 1, "No executer selected, please use: -profile docker/singularity/conda (choose one of the 3 possibilities)"}
+
+// Include modules
+include {ConvertFastqToSam; MergeBam; MergeBam2; SortConsensus; SortConsensus as RerunSortConsensus; ConvertSamToFastq; ConvertSamToFastq as RerunConvertSamToFastq } from "$baseDir/modules/gatk.nf"
+include {CreateSequenceDictionary; BedToIntervalList; CollectHsMetrics; CollectHsMetrics as RerunCollectHsMetrics;} from "$baseDir/modules/picard.nf"
+include {BwaIndex; BWAmem; BWAmem as RerunBWAmem;} from "$baseDir/modules/bwamem.nf"
+include {BCFtools_stats} from "$baseDir/modules/bcftools.nf"
+include {Fastp} from "$baseDir/modules/fastp.nf"
+include {ExtractUmis; GroupReads; CallConsensus; } from "$baseDir/modules/fgbio.nf"
+include {MultiQC; MultiQC as MultiQC_ALL; } from "$baseDir/modules/multiqc.nf"
+include {Faidx; UmiMergeFilt; Sam2bam; Sam2bam as ReSam2bam} from "$baseDir/modules/samtools.nf"
+include {DealWithRef} from "$baseDir/modules/pigz.nf"
+include {VarDict} from "$baseDir/modules/vardict.nf"
+include {AnnotationVEP} from "$baseDir/modules/vep.nf"
+
+
+// Define the workflow
+workflow {
+ Channel.fromFilePairs(params.fastq, checkIfExists:true)
+ .filter{ v -> v=~ params.filter_fastq}
+ .set{read_pairs_fastq}
+ Channel.fromPath(params.ref, checkIfExists: true)
+ .ifEmpty { exit 1, "Cannot find genome matching ${params.ref}!\n" }
+ .set{genomeref}
-process MultiQC_ALL {
- publishDir "${params.outdir}", mode: 'copy', overwrite: true
- input:
- tuple val(sample_id), path(file)
- val extension
+ // htslib-based tools (e.g. samtools, bcftools) require a BGZF-compressed FASTA and use a .fai index (samtools faidx)
+ DealWithRef(genomeref.collect())
+ DealWithRef.out.genomerefok.set{genomerefok}
- output:
- file("${extension}")
+ Faidx(genomerefok).set{genome_fai}
- """
- multiqc ${params.outdir} -o ${extension}
- """
-}
+ // Create the genome index
+ BwaIndex(genomerefok)
-workflow {
- Channel.fromFilePairs(params.fastq, checkIfExists:true)
- .filter{ v -> v=~ params.filter_fastq}
- .set{read_pairs_fastq}
+ // Picard/GATK can read a gzip-compressed FASTA but needs a .dict file
+ // Create the genome dictionary
+ CreateSequenceDictionary(genomerefok)
+ .set{genome_dict}
// 1. Preprocess deduplication
ConvertFastqToSam(read_pairs_fastq, ".1.unmapped")
ExtractUmis(ConvertFastqToSam.out, params.struct_r1, params.struct_r2, ".1.umi_extracted")
ConvertSamToFastq(ExtractUmis.out, ".1.umi_extracted")
Fastp(ConvertSamToFastq.out, ".1.umi_extracted.trimmed")
- BWAmem(Fastp.out[0], "-t 10", ".1.umi_extracted.aligned")
+ BWAmem(Fastp.out[0], genomerefok, BwaIndex.out, "", ".1.umi_extracted.aligned")
+ Sam2bam(BWAmem.out.tuple_sample_sam)
- BWAmem.out.join(ExtractUmis.out).set{bams_umis}
- MergeBam(bams_umis, ".1.merged")
+ Sam2bam.out.join(ExtractUmis.out).set{bams_umis}
+ MergeBam(bams_umis, genomerefok, genome_dict, ".1.merged")
UmiMergeFilt(MergeBam.out, ".1.filtered")
// 2. Process deduplication
GroupReads(UmiMergeFilt.out, ".2.umi_grouped")
- CallConsensus(GroupReads.out.nextout, ".2.consensus_unmapped")
+ CallConsensus(GroupReads.out.nextout, genome_fai, ".2.consensus_unmapped")
// 3. Post process deduplication
RerunConvertSamToFastq(CallConsensus.out, ".3.unmapped")
- RerunBWAmem(RerunConvertSamToFastq.out.final_out, "-t 10 -Y", ".3.consensus_mapped")
+ RerunBWAmem(RerunConvertSamToFastq.out, genomerefok, BwaIndex.out, " -Y", ".3.consensus_mapped")
SortConsensus(CallConsensus.out, ".3.unmapped")
- RerunSortConsensus(RerunBWAmem.out.final_out, ".3.mapped")
+ ReSam2bam(RerunBWAmem.out.tuple_sample_sam)
+ RerunSortConsensus(ReSam2bam.out, ".3.mapped")
- RerunSortConsensus.out.final_out.join(SortConsensus.out).set{bams_consensus}
- MergeBam2(bams_consensus, ".3.merged")
+ RerunSortConsensus.out.join(SortConsensus.out).set{bams_consensus}
+ MergeBam2(bams_consensus, genomerefok, genome_dict, ".3.merged")
// 4. Variant Calling & Annotation
- VarDict(MergeBam2.out, ".4.vardict")
- AnnotationVEP(VarDict.out, ".4.vardict.vep")
+ VarDict(MergeBam2.out, genomerefok, genome_fai, params.bed, ".4.vardict")
+ AnnotationVEP(VarDict.out, genomerefok, ".4.vardict.vep")
// Quality Controls
- BedToIntervalList(params.dict, params.bed, params.bed.tokenize('.')[0].tokenize('/')[-1])
- CollectHsMetrics(BWAmem.out, BedToIntervalList.out, ".QC.HsMetrics.1")
- RerunCollectHsMetrics(RerunBWAmem.out.final_out, BedToIntervalList.out, ".QC.HsMetrics.3")
+ BedToIntervalList(genome_dict, params.bed, params.bed.tokenize('.')[0].tokenize('/')[-1])
+ CollectHsMetrics(Sam2bam.out, BedToIntervalList.out, genomerefok, genome_fai, ".QC.HsMetrics.1")
+ RerunCollectHsMetrics(ReSam2bam.out, BedToIntervalList.out, genomerefok, genome_fai, ".QC.HsMetrics.3")
BCFtools_stats(VarDict.out, ".QC.bcftools_stats")
MultiQC(BCFtools_stats.out, ".QC.multiQC")
-
- Channel.empty()
- .mix( MultiQC.out )
- .map { sample, files -> files }
- .collect()
- .set { log_files }
- MultiQC_ALL(log_files, "all.QC.multiQC")
+ MultiQC_ALL(BCFtools_stats.out.collect(), "all.QC.multiQC")
}
diff --git a/README.md b/README.md
index 74e72ff..d08d2e0 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,22 @@
# MobiCT
ctDNA Analysis pipeline. Version 1.0.0
-# Introduction
+## Table of Contents
+
+* [Introduction](#introduction)
+* [Quick Start](#quick-start)
+* [Installation](#installation)
+ * [Nextflow](#nextflow)
+ * [Execution environment](#execution-environment)
+ * [Container platform](#container-platform)
+ * [Docker](#docker)
+ * [Singularity](#singularity)
+ * [Conda](#conda)
+* [Usage](#usage)
+* [Pipeline output](#pipeline-output)
+* [Data availability](#data-availability)
+
+## Introduction
MobiCT is an analysis pipeline designed for detecting SNVs and small InDels in circulating tumor DNA (ctDNA) obtained through non-invasive liquid biopsy, serving diagnostic, prognostic, and therapeutic purposes.
The pipeline is built using Nextflow.
The associated publication is available here: /doi/.../
@@ -20,13 +35,118 @@ The associated publication is available here: /doi/.../
4. Download the reference genome
5. Download the datasets needed by VEP (see https://github.com/Ensembl/ensembl-vep)
6. Edit the *.config* file with input and output files/paths
-7. Activate your conda environment
-8. Run MobiCT on your Dataset
+7. Run MobiCT on your Dataset
`nextflow -log /output_directory/my.log run MobiCT.nf -c nextflow.config`
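+
+For step 6, a minimal params block in *nextflow.config* could look like this (placeholder paths to adapt; the full list of parameters is in `nextflow.config`):
+
+```groovy
+params {
+    ref    = "/path/to/referenceGenome.fasta"               // reference genome
+    fastq  = "/path/to/fastq/files/*_{R1,R2}_001.fastq.gz"  // paired FASTQ files
+    outdir = "/path/to/outdir"                              // output directory
+    bed    = "/path/to/bedFile"                             // target regions
+    cache  = ""                                             // VEP cache; leave empty to fall back to --database
+}
+```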
-# Pipeline output
+## Installation
+
+The prerequisites to run the pipeline are:
+
+ * [Nextflow](https://www.nextflow.io/) >= 22.04.0
+ * [Docker](https://www.docker.com) or [Singularity](https://sylabs.io/singularity/) or [conda](https://juke34.github.io/fix-anaconda-licensing-issues/en/pages/conda-distrib/)
+
+### Nextflow
+
+ * Via conda
+
+
+ See the [nextflow package on Bioconda](https://anaconda.org/bioconda/nextflow):
+
+ ```bash
+ conda create -n nextflow
+ conda activate nextflow
+ conda install bioconda::nextflow
+ ```
+
+
+ * Manually
+
+ See the [official installation documentation](https://www.nextflow.io/docs/latest/install.html).
+ Nextflow runs on most POSIX systems (Linux, macOS, etc.) and can typically be installed by running these commands:
+
+ ```bash
+ # Make sure Java 11 or later is installed on your computer by using the command:
+ java -version
+
+ # Install Nextflow by entering this command in your terminal (it creates a file named nextflow in the current directory):
+ curl -s https://get.nextflow.io | bash
+
+ # Add Nextflow binary to your user's PATH:
+ mv nextflow ~/bin/
+ # OR system-wide installation:
+ # sudo mv nextflow /usr/local/bin
+ ```
+
+
+### Execution environment
+
+#### Container platform
+
+To run the workflow you will need a container platform: docker or singularity.
+
+##### Docker
+
+Please follow the instructions at the [Docker website](https://docs.docker.com/desktop/)
+
+##### Singularity
+
+Please follow the instructions at the [Singularity website](https://docs.sylabs.io/guides/latest/admin-guide/installation.html)
+
+#### Conda
+
+It is recommended to use a container platform. However, if you prefer to use Conda, please follow the instructions available [here](https://juke34.github.io/fix-anaconda-licensing-issues/en/pages/conda-installation/)
+
+## Usage
+
+### Profile
+
+To run the workflow you must select a profile according to the container platform you want to use:
+- `singularity`, a profile using Singularity containers to run the software
+- `docker`, a profile using Docker containers to run the software
+- `conda`, a profile using conda environments to run the software
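+
+Profiles can be combined with the executor and resource profiles defined in `nextflow.config` (`slurm`, `local`, `ci`). For instance, a run using Docker containers on a Slurm cluster would look like:
+
+```bash
+nextflow run MobiCT.nf -profile docker,slurm
+```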
+
+
+### From Release
+
+#### Default usage
+The command will look like this:
+
+```bash
+nextflow run SimCab-CHU/MobiCT -r v1.0.0 -profile docker
+```
+
+#### Test
+
+```bash
+nextflow run SimCab-CHU/MobiCT -r v1.0.0 -profile docker,test
+```
+
+### From repo
+
+```bash
+# clone the workflow repository
+git clone https://github.com/SimCab-CHU/MobiCT.git
+
+# Move into it
+cd MobiCT
+```
+#### Default usage
+
+```bash
+# Run MobiCT
+nextflow run MobiCT.nf -profile docker
+```
+
+#### Test
+
+```bash
+# Run MobiCT
+nextflow run MobiCT.nf -profile docker,test
+```
+
+## Pipeline output
The output files are stored in the directory you specified using the **outdir** parameter in the .config file. The **outdir** contains a folder per sample plus a **multiQC** folder. Each sample folder contains a deduplicated and aligned *.bam* file and its index, an annotated *.vcf* file, 3 metrics files (*.HsMetrics.1.txt*, *.HsMetrics.3.txt* and *QC.bcftools_stats.stats*).
-# Data availability
+## Data availability
Raw sequencing data (FASTQ files) of commercial controls used in the study are available @ https://www.ncbi.nlm.nih.gov/sra/PRJNA1209006.
diff --git a/config/ressources/ci.config b/config/ressources/ci.config
new file mode 100644
index 0000000..968c27c
--- /dev/null
+++ b/config/ressources/ci.config
@@ -0,0 +1,56 @@
+process {
+ cpus = 1
+ time = '1h'
+ maxForks = 2
+ shell = ['/bin/bash', '-euo', 'pipefail']
+ stageOutMode = 'rsync'
+
+ withLabel: 'bcftools' {
+ cpus = 2
+ time = '1h'
+ }
+ withName: 'BwaIndex' {
+ cpus = 1
+ time = '4h'
+ }
+ withName: 'BWAmem' {
+ cpus = 2
+ time = '4h'
+ }
+ withLabel: 'fastp' {
+ cpus = 2
+ time = '4h'
+ }
+ withLabel: 'fgbio' {
+ cpus = 2
+ time = '4h'
+ }
+ withLabel: 'gatk4' {
+ cpus = 2
+ time = '4h'
+ }
+ withLabel: 'pigz' {
+ cpus = 2
+ time = '4h'
+ }
+ withLabel: 'multiqc' {
+ cpus = 2
+ time = '2h'
+ }
+ withLabel: 'picard' {
+ cpus = 2
+ time = '1h'
+ }
+ withLabel: 'samtools' {
+ cpus = 2
+ time = '4h'
+ }
+ withLabel: 'vardict' {
+ cpus = 1
+ time = '4h'
+ }
+ withLabel: 'vep' {
+ cpus = 2
+ time = '4h'
+ }
+}
diff --git a/config/ressources/conda/bcftools.yaml b/config/ressources/conda/bcftools.yaml
new file mode 100644
index 0000000..fc17e15
--- /dev/null
+++ b/config/ressources/conda/bcftools.yaml
@@ -0,0 +1,7 @@
+name: bcftools
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - bcftools=1.9
\ No newline at end of file
diff --git a/config/ressources/conda/bwa.yaml b/config/ressources/conda/bwa.yaml
new file mode 100644
index 0000000..7f710ae
--- /dev/null
+++ b/config/ressources/conda/bwa.yaml
@@ -0,0 +1,7 @@
+name: bwa
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - bwa=0.7.19
\ No newline at end of file
diff --git a/config/ressources/conda/ensembl-vep.yaml b/config/ressources/conda/ensembl-vep.yaml
new file mode 100644
index 0000000..a9ee1c5
--- /dev/null
+++ b/config/ressources/conda/ensembl-vep.yaml
@@ -0,0 +1,7 @@
+name: ensembl-vep
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - ensembl-vep=113.0
\ No newline at end of file
diff --git a/config/ressources/conda/fastp.yaml b/config/ressources/conda/fastp.yaml
new file mode 100644
index 0000000..8672409
--- /dev/null
+++ b/config/ressources/conda/fastp.yaml
@@ -0,0 +1,7 @@
+name: fastp
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - fastp=0.24.0
\ No newline at end of file
diff --git a/config/ressources/conda/fgbio.yaml b/config/ressources/conda/fgbio.yaml
new file mode 100644
index 0000000..348db93
--- /dev/null
+++ b/config/ressources/conda/fgbio.yaml
@@ -0,0 +1,7 @@
+name: fgbio
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - fgbio=2.5.0
\ No newline at end of file
diff --git a/config/ressources/conda/gatk4.yaml b/config/ressources/conda/gatk4.yaml
new file mode 100644
index 0000000..079a8dc
--- /dev/null
+++ b/config/ressources/conda/gatk4.yaml
@@ -0,0 +1,7 @@
+name: gatk4
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - gatk4=4.6.1.0
\ No newline at end of file
diff --git a/config/ressources/conda/multiqc.yaml b/config/ressources/conda/multiqc.yaml
new file mode 100644
index 0000000..f630102
--- /dev/null
+++ b/config/ressources/conda/multiqc.yaml
@@ -0,0 +1,7 @@
+name: multiqc
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - multiqc=1.28
\ No newline at end of file
diff --git a/config/ressources/conda/picard.yaml b/config/ressources/conda/picard.yaml
new file mode 100644
index 0000000..61da0f6
--- /dev/null
+++ b/config/ressources/conda/picard.yaml
@@ -0,0 +1,7 @@
+name: picard
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - picard=3.3.0
\ No newline at end of file
diff --git a/config/ressources/conda/pigz.yaml b/config/ressources/conda/pigz.yaml
new file mode 100644
index 0000000..e18ebd6
--- /dev/null
+++ b/config/ressources/conda/pigz.yaml
@@ -0,0 +1,7 @@
+name: pigz
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - pigz=2.8
\ No newline at end of file
diff --git a/config/ressources/conda/samtools.yaml b/config/ressources/conda/samtools.yaml
new file mode 100644
index 0000000..9beff77
--- /dev/null
+++ b/config/ressources/conda/samtools.yaml
@@ -0,0 +1,7 @@
+name: samtools
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - samtools=1.6
\ No newline at end of file
diff --git a/config/ressources/conda/vardict.yaml b/config/ressources/conda/vardict.yaml
new file mode 100644
index 0000000..b6ab698
--- /dev/null
+++ b/config/ressources/conda/vardict.yaml
@@ -0,0 +1,7 @@
+name: vardict
+channels:
+ - conda-forge
+ - bioconda
+
+dependencies:
+ - vardict-java=1.8.3
\ No newline at end of file
diff --git a/config/ressources/hpc.config b/config/ressources/hpc.config
new file mode 100644
index 0000000..1be18d1
--- /dev/null
+++ b/config/ressources/hpc.config
@@ -0,0 +1,56 @@
+process {
+ cpus = 1
+ time = '1h'
+ maxForks = 20
+ shell = ['/bin/bash', '-euo', 'pipefail']
+ stageOutMode = 'rsync'
+
+ withLabel: 'bcftools' {
+ cpus = 4
+ time = '1h'
+ }
+ withName: 'BwaIndex' {
+ cpus = 1
+ time = '4h'
+ }
+ withName: 'BWAmem' {
+ cpus = 16
+ time = '4h'
+ }
+ withLabel: 'fastp' {
+ cpus = 4
+ time = '4h'
+ }
+ withLabel: 'fgbio' {
+ cpus = 4
+ time = '4h'
+ }
+ withLabel: 'gatk4' {
+ cpus = 4
+ time = '4h'
+ }
+ withLabel: 'pigz' {
+ cpus = 8
+ time = '4h'
+ }
+ withLabel: 'multiqc' {
+ cpus = 4
+ time = '2h'
+ }
+ withLabel: 'picard' {
+ cpus = 4
+ time = '1h'
+ }
+ withLabel: 'samtools' {
+ cpus = 8
+ time = '4h'
+ }
+ withLabel: 'vardict' {
+ cpus = 1
+ time = '4h'
+ }
+ withLabel: 'vep' {
+ cpus = 4
+ time = '4h'
+ }
+}
diff --git a/config/ressources/local.config b/config/ressources/local.config
new file mode 100644
index 0000000..327eeca
--- /dev/null
+++ b/config/ressources/local.config
@@ -0,0 +1,56 @@
+process {
+ cpus = 1
+ time = '1h'
+ maxForks = 4
+ shell = ['/bin/bash', '-euo', 'pipefail']
+ stageOutMode = 'rsync'
+
+ withLabel: 'bcftools' {
+ cpus = 4
+ time = '1h'
+ }
+ withName: 'BwaIndex' {
+ cpus = 1
+ time = '4h'
+ }
+ withName: 'BWAmem' {
+ cpus = 4
+ time = '4h'
+ }
+ withLabel: 'fastp' {
+ cpus = 4
+ time = '4h'
+ }
+ withLabel: 'fgbio' {
+ cpus = 4
+ time = '4h'
+ }
+ withLabel: 'gatk4' {
+ cpus = 4
+ time = '4h'
+ }
+ withLabel: 'pigz' {
+ cpus = 4
+ time = '4h'
+ }
+ withLabel: 'multiqc' {
+ cpus = 4
+ time = '2h'
+ }
+ withLabel: 'picard' {
+ cpus = 4
+ time = '1h'
+ }
+ withLabel: 'samtools' {
+ cpus = 4
+ time = '4h'
+ }
+ withLabel: 'vardict' {
+ cpus = 1
+ time = '4h'
+ }
+ withLabel: 'vep' {
+ cpus = 4
+ time = '4h'
+ }
+}
diff --git a/config/softwares.config b/config/softwares.config
new file mode 100644
index 0000000..05bbf04
--- /dev/null
+++ b/config/softwares.config
@@ -0,0 +1,57 @@
+process {
+ withLabel: 'bcftools' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/bcftools.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/bcftools:1.9--h4da6232_0' :
+ docker.enabled ? 'quay.io/biocontainers/bcftools:1.9--h4da6232_0' : '' }
+ }
+ withLabel: 'bwamem' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/bwa.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/bwa:0.7.19--h577a1d6_0' :
+ docker.enabled ? 'quay.io/biocontainers/bwa:0.7.19--h577a1d6_0' : '' }
+ }
+ withLabel: 'fastp' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/fastp.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/fastp:0.24.0--heae3180_1' :
+ docker.enabled ? 'quay.io/biocontainers/fastp:0.24.0--heae3180_1' : '' }
+ }
+ withLabel: 'fgbio' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/fgbio.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/fgbio:2.5.0--hdfd78af_1' :
+ docker.enabled ? 'quay.io/biocontainers/fgbio:2.5.0--hdfd78af_1' : '' }
+ }
+ withLabel: 'gatk4' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/gatk4.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/gatk4:4.6.1.0--py310hdfd78af_0' :
+ docker.enabled ? 'quay.io/biocontainers/gatk4:4.6.1.0--py310hdfd78af_0' : '' }
+ }
+ withLabel: 'multiqc' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/multiqc.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/multiqc:1.28--pyhdfd78af_0' :
+ docker.enabled ? 'quay.io/biocontainers/multiqc:1.28--pyhdfd78af_0' : '' }
+ }
+ withLabel: 'picard' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/picard.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/picard:3.3.0--hdfd78af_0' :
+ docker.enabled ? 'quay.io/biocontainers/picard:3.3.0--hdfd78af_0' : '' }
+ }
+ withLabel: 'pigz' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/pigz.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/pigz:2.8' :
+ docker.enabled ? 'quay.io/biocontainers/pigz:2.8' : '' }
+ }
+ withLabel: 'samtools' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/samtools.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/samtools:1.6--h5fe306e_11' :
+ docker.enabled ? 'quay.io/biocontainers/samtools:1.6--h5fe306e_11' : '' }
+ }
+ withLabel: 'vardict' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/vardict.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/vardict-java:1.8.3--hdfd78af_0' :
+ docker.enabled ? 'quay.io/biocontainers/vardict-java:1.8.3--hdfd78af_0' : '' }
+ }
+ withLabel: 'vep' {
+ conda = { singularity.enabled || docker.enabled ? '' : "$baseDir/config/ressources/conda/ensembl-vep.yaml" }
+ container = { singularity.enabled ? 'https://depot.galaxyproject.org/singularity/ensembl-vep:113.0--pl5321h2a3209d_0' :
+ docker.enabled ? 'quay.io/biocontainers/ensembl-vep:113.0--pl5321h2a3209d_0' : '' }
+ }
+}
\ No newline at end of file
diff --git a/modules/bcftools.nf b/modules/bcftools.nf
new file mode 100644
index 0000000..297d942
--- /dev/null
+++ b/modules/bcftools.nf
@@ -0,0 +1,17 @@
+process BCFtools_stats {
+ tag "${sample_id}"
+ label 'bcftools'
+
+ publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true
+
+ input:
+ tuple val(sample_id), path(file)
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.stats")
+
+ """
+ bcftools stats --threads ${task.cpus} ${file} > ${sample_id}${extension}.stats
+ """
+}
diff --git a/modules/bwamem.nf b/modules/bwamem.nf
new file mode 100644
index 0000000..ccf6d8b
--- /dev/null
+++ b/modules/bwamem.nf
@@ -0,0 +1,44 @@
+process BwaIndex {
+ label 'bwamem'
+ tag "$genome_fasta"
+
+ input:
+ path(genome_fasta)
+
+ output:
+ path("*")
+
+ script:
+ """
+ bwa index -p ${genome_fasta.baseName} $genome_fasta
+ """
+}
+
+// Alignment before deduplication: align quality- and adapter-trimmed reads to
+// the reference genome
+process BWAmem {
+ tag "$sample_id"
+ label 'bwamem'
+
+ input:
+ tuple val(sample_id), path(fastq)
+ path genomeref
+ path bwa_index_files
+ val opt_bwa
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.sam"), emit: tuple_sample_sam
+ path "*.log", emit: bwamem_summary
+
+ """
+ bwa mem \
+ -t ${task.cpus} \
+ ${opt_bwa} \
+ -M \
+ ${genomeref.baseName} \
+ ${fastq[0]} \
+ ${fastq[1]} \
+ > ${sample_id}${extension}.sam 2> ${sample_id}${extension}.log
+ """
+}
\ No newline at end of file
diff --git a/modules/fastp.nf b/modules/fastp.nf
new file mode 100644
index 0000000..c0ade77
--- /dev/null
+++ b/modules/fastp.nf
@@ -0,0 +1,30 @@
+// Adapter and quality trimming
+process Fastp {
+ tag "$sample_id"
+ label 'fastp'
+
+ publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.{json,html}'
+
+ input:
+ tuple val(sample_id), path(fastq)
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq")
+ file "${sample_id}.QC.fastp.json"
+ file "${sample_id}.QC.fastp.html"
+
+ script:
+ """
+ fastp \
+ --thread ${task.cpus} \
+ -i ${fastq[0]} \
+ -o ${sample_id}${extension}.R1.fq \
+ -I ${fastq[1]} \
+ -O ${sample_id}${extension}.R2.fq \
+ -g -W 5 -q 20 -u 40 -x -3 -l 75 -c \
+ -j ${sample_id}.QC.fastp.json \
+ -h ${sample_id}.QC.fastp.html
+ """
+}
\ No newline at end of file
diff --git a/modules/fgbio.nf b/modules/fgbio.nf
new file mode 100644
index 0000000..227a1b7
--- /dev/null
+++ b/modules/fgbio.nf
@@ -0,0 +1,81 @@
+// Extraction of UMIs from the insert reads
+process ExtractUmis {
+ tag "$sample_id"
+ label 'fgbio'
+
+ input:
+ tuple val(sample_id), path(bam_file)
+ val struct_r1
+ val struct_r2
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.bam")
+
+ script:
+ """
+ fgbio ExtractUmisFromBam \
+ -i ${bam_file} \
+ -o ${sample_id}${extension}.bam \
+ -r ${struct_r1} ${struct_r2} \
+ -t RX \
+ -a true
+ """
+}
+
+// Identify and group reads originating from the same source molecule
+// The user can control how many errors/mismatches are allowed in the UMI sequence when assigning source molecules (--edits=n).
+process GroupReads {
+ tag "$sample_id"
+ label 'fgbio'
+ publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true, pattern: '*.txt'
+
+ input:
+ tuple val(sample_id), path(bam)
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.bam"), emit: nextout
+ file "${sample_id}.QC.family_size_counts.txt"
+
+ script:
+ """
+ fgbio GroupReadsByUmi \
+ -i ${bam} \
+ -o ${sample_id}${extension}.bam \
+ --strategy=adjacency \
+ --edits=1 \
+ -t RX \
+ -f ${sample_id}.QC.family_size_counts.txt
+ """
+}
+
+// Generate consensus reads
+// Calculate the consensus sequence. Reads that occur as singletons are
+// discarded by default, but this can be changed by setting the --min-reads flag
+// to 1, in which case the single read is kept as the consensus.
+process CallConsensus {
+ tag "$sample_id"
+ label 'fgbio'
+
+ input:
+ tuple val(sample_id), path(bam)
+ path genome_fai
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.bam")
+
+ script:
+ """
+ fgbio CallMolecularConsensusReads \
+ -i ${bam} \
+ -o ${sample_id}${extension}.bam \
+ --error-rate-post-umi 40 \
+ --error-rate-pre-umi 45 \
+ --output-per-base-tags false \
+ --min-reads 2 \
+ --max-reads 50 \
+ --read-name-prefix='consensus'
+ """
+}
\ No newline at end of file
diff --git a/modules/gatk.nf b/modules/gatk.nf
new file mode 100644
index 0000000..10f00cd
--- /dev/null
+++ b/modules/gatk.nf
@@ -0,0 +1,136 @@
+// Convert the demultiplexed, raw sequencing FASTQ files to BAM
+process ConvertFastqToSam {
+ tag "$sample_id"
+ label 'gatk4'
+
+ input:
+ tuple val(sample_id), path(fastq)
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.bam")
+
+ """
+ gatk FastqToSam \
+ --FASTQ ${fastq[0]} \
+ --FASTQ2 ${fastq[1]} \
+ --OUTPUT ${sample_id}${extension}.bam \
+ --SAMPLE_NAME ${sample_id} \
+ """
+}
+
+// Convert the BAM file with UMI extracted reads to a FASTQ file
+process ConvertSamToFastq {
+ tag "$sample_id"
+ label 'gatk4'
+
+ input:
+ tuple val(sample_id), path(bam_file)
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.R[1,2].fq")
+
+ """
+ gatk SamToFastq \
+ -I ${bam_file} \
+ -F ${sample_id}${extension}.R1.fq \
+ -F2 ${sample_id}${extension}.R2.fq \
+ --CLIPPING_ATTRIBUTE XT \
+ --CLIPPING_ACTION 2 \
+ --MAX_RECORDS_IN_RAM 50000000
+ """
+}
+
+// Merge the two BAM files containing:
+// 1: the UMI information: output of ExtractUmis process
+// 2: the alignment coordinate information: output of bwaMEM process
+process MergeBam {
+ tag "$sample_id"
+ label 'gatk4'
+
+ input:
+ tuple val(sample_id), path(bam_aligned), path(bam_unmapped)
+ path genomeref
+ path genome_dict
+ val extension
+
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.ba[i,m]")
+
+ """
+ gatk MergeBamAlignment \
+ --ATTRIBUTES_TO_RETAIN X0 \
+ --ATTRIBUTES_TO_REMOVE NM \
+ --ATTRIBUTES_TO_REMOVE MD \
+ --ALIGNED_BAM ${bam_aligned} \
+ --UNMAPPED_BAM ${bam_unmapped} \
+ --OUTPUT ${sample_id}${extension}.bam \
+ --REFERENCE_SEQUENCE ${genomeref} \
+ --SORT_ORDER 'queryname' \
+ --ALIGNED_READS_ONLY true \
+ --MAX_INSERTIONS_OR_DELETIONS -1 \
+ --PRIMARY_ALIGNMENT_STRATEGY MostDistant \
+ --ALIGNER_PROPER_PAIR_FLAGS true \
+ --CREATE_INDEX true \
+ --CLIP_OVERLAPPING_READS false
+ """
+}
+
+// Sort the consensus_mapped.bam with the consensus_unmapped.bam to prepare them as input for the next step
+process SortConsensus {
+ tag "$sample_id"
+ label 'gatk4'
+
+ input:
+ tuple val(sample_id), path(bam)
+ val extension
+
+ output:
+ tuple val(sample_id), path("${sample_id}${extension}.sort.bam")
+
+ script:
+
+ """
+ gatk SortSam \
+ -I ${bam} \
+ --OUTPUT ${sample_id}${extension}.sort.bam \
+ --SORT_ORDER queryname
+ """
+}
+
+// Finally, merge the consensus_mapped.bam with the consensus_unmapped.bam to
+// retain the UMI group information.
+process MergeBam2 {
+ tag "$sample_id"
+ label 'gatk4'
+
+ publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true
+
+ input:
+ tuple val(sample_id), path(bam_aligned), path(bam_unmapped)
+ path genomeref
+ path genome_dict
+ val extension
+
+ output:
+ tuple val(sample_id), path("${sample_id}${extension}.ba[m,i]")
+
+ """
+ gatk MergeBamAlignment \
+ --ATTRIBUTES_TO_RETAIN X0 \
+ --ATTRIBUTES_TO_RETAIN RX \
+ --ALIGNED_BAM ${bam_aligned} \
+ --UNMAPPED_BAM ${bam_unmapped} \
+ --OUTPUT ${sample_id}${extension}.bam \
+ --REFERENCE_SEQUENCE ${genomeref} \
+ --SORT_ORDER coordinate \
+ --ADD_MATE_CIGAR true \
+ --MAX_INSERTIONS_OR_DELETIONS -1 \
+ --PRIMARY_ALIGNMENT_STRATEGY MostDistant \
+ --ALIGNER_PROPER_PAIR_FLAGS true \
+ --CREATE_INDEX true \
+ --CLIP_OVERLAPPING_READS false
+ """
+}
\ No newline at end of file
diff --git a/modules/multiqc.nf b/modules/multiqc.nf
new file mode 100644
index 0000000..d1627d1
--- /dev/null
+++ b/modules/multiqc.nf
@@ -0,0 +1,19 @@
+// Generate a MultiQC quality-control report from the collected per-sample
+// metrics (BCFtools_stats output).
+process MultiQC {
+ tag "${sample_id}"
+ label 'multiqc'
+
+ publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true
+
+ input:
+ tuple val(sample_id), path(file)
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}")
+
+ """
+ multiqc -p . -o ${sample_id}${extension}
+ """
+}
\ No newline at end of file
diff --git a/modules/picard.nf b/modules/picard.nf
new file mode 100644
index 0000000..e638538
--- /dev/null
+++ b/modules/picard.nf
@@ -0,0 +1,59 @@
+process CreateSequenceDictionary {
+ label 'picard'
+
+ input:
+ path genome
+
+ output:
+ file "*.dict"
+
+ """
+ picard CreateSequenceDictionary R=${genome} O=${genome}.dict
+ """
+}
+
+process BedToIntervalList {
+ label 'picard'
+
+ input:
+ path dict
+ path bed
+ val extension
+
+ output:
+ file "${extension}.interval_list"
+
+ """
+ picard BedToIntervalList \
+ --SEQUENCE_DICTIONARY ${dict} \
+ --INPUT ${bed} \
+ --OUTPUT ${extension}.interval_list
+ """
+}
+
+// Extraction of read quality metrics before deduplication
+process CollectHsMetrics {
+ tag "$sample_id"
+ label 'picard'
+
+ publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true
+
+ input:
+ tuple val(sample_id), file(bam)
+ file bed
+ path genomeref
+ path genome_fai
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.txt")
+
+ """
+ picard CollectHsMetrics \
+ --REFERENCE_SEQUENCE ${genomeref} \
+ --BAIT_INTERVALS ${bed} \
+ --TARGET_INTERVALS ${bed} \
+ --INPUT ${bam} \
+ --OUTPUT ${sample_id}${extension}.txt
+ """
+}
diff --git a/modules/pigz.nf b/modules/pigz.nf
new file mode 100644
index 0000000..25bb466
--- /dev/null
+++ b/modules/pigz.nf
@@ -0,0 +1,29 @@
+// Prepare the reference genome: uncompress it if gzipped and normalise the extension to .fa
+process DealWithRef {
+ tag "$genome_fasta"
+ label 'pigz'
+
+ input:
+ path(genome_fasta)
+
+ output:
+ path "*.fa", emit: genomerefok
+
+ script:
+ genomeReady = genome_fasta.baseName.replaceAll(/\.(gz)$/, '') // baseName drops the last extension; also strip any residual .gz
+ genomeReady = genomeReady.replaceAll(/\.(fasta|fa)$/, '') // strip .fasta or .fa
+ genomeFa = genomeReady + ".fa"
+ """
+ # Uncompress the reference if it is gzipped
+ extension=\$(echo ${genome_fasta} | awk -F . '{print \$NF}')
+
+ if [[ \${extension} == "gz" ]];then
+ pigz -dck -p ${task.cpus} ${genome_fasta} > ${genomeFa}
+ else
+ # symlink the already-uncompressed FASTA
+ ln -s ${genome_fasta} ${genomeFa}
+ fi
+ """
+}
\ No newline at end of file
diff --git a/modules/samtools.nf b/modules/samtools.nf
new file mode 100644
index 0000000..6954802
--- /dev/null
+++ b/modules/samtools.nf
@@ -0,0 +1,53 @@
+// Index the reference FASTA with samtools faidx (produces the .fai index)
+process Faidx {
+ tag "$genome_fasta"
+ label 'samtools'
+
+ input:
+ path(genome_fasta)
+
+ output:
+ path("*")
+
+ """
+ samtools faidx $genome_fasta
+ """
+}
+
+// Keep only properly paired reads (SAM flag 0x2)
+process UmiMergeFilt {
+ tag "$sample_id"
+ label 'samtools'
+
+ input:
+ tuple val(sample_id), path(bam)
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.bam")
+
+ """
+ samtools view \
+ -f2 \
+ -bh ${bam} \
+ > ${sample_id}${extension}.bam
+ """
+}
+
+process Sam2bam {
+ label 'samtools'
+ tag "$sample"
+
+ input:
+ tuple val(sample), path(sam)
+
+ output:
+ tuple val(sample), path ("*.bam"), emit: tuple_sample_bam
+
+ script:
+
+ """
+ samtools view -@ ${task.cpus} -bh -o ${sam.baseName}.bam ${sam}
+ """
+
+}
\ No newline at end of file
diff --git a/modules/vardict.nf b/modules/vardict.nf
new file mode 100644
index 0000000..f6e6e46
--- /dev/null
+++ b/modules/vardict.nf
@@ -0,0 +1,29 @@
+// Variant calling step using vardict
+process VarDict {
+ tag "$sample_id"
+ label 'vardict'
+
+ input:
+ tuple val(sample_id), path(bami)
+ path genomeref
+ path genomeref_fai
+ path bed
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.vcf")
+
+ """
+ vardict-java \
+ -G ${genomeref} \
+ -f 0.0005 \
+ -N ${sample_id} \
+ -b ${bami[1]} \
+ -c 1 \
+ -S 2 \
+ -E 3 \
+ -g 4 ${bed} \
+ | teststrandbias.R \
+ | var2vcf_valid.pl > ${sample_id}${extension}.vcf
+ """
+}
diff --git a/modules/vep.nf b/modules/vep.nf
new file mode 100644
index 0000000..c95075e
--- /dev/null
+++ b/modules/vep.nf
@@ -0,0 +1,33 @@
+// Annotation step using VEP
+process AnnotationVEP {
+ tag "$sample_id"
+ label 'vep'
+
+ publishDir "${params.outdir}/${sample_id}", mode: 'copy', overwrite: true
+
+ input:
+ tuple val(sample_id), path(vcf)
+ path genomeref
+ val extension
+
+ output:
+ tuple val(sample_id), file("${sample_id}${extension}.vcf")
+
+ script:
+ def config = params.cache?.trim() ? "--offline --cache --dir_cache ${params.cache} --max_af " : "--database"
+ """
+ vep \
+ ${config} \
+ --fork ${task.cpus} \
+ -i ${vcf} \
+ -o ${sample_id}${extension}.vcf \
+ --force_overwrite \
+ --vcf \
+ --numbers \
+ --refseq \
+ --symbol \
+ --hgvs \
+ --canonical \
+ --fasta ${genomeref}
+ """
+}
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 9293def..c539348 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,21 +1,86 @@
-docker.enabled = true
-conda.enabled = true
+manifest {
+ name = 'MobiCT'
+ author = ''
+ homePage = 'https://github.com/SimCab-CHU/MobiCT'
+ description = 'ctDNA Analysis pipeline'
+ mainScript = 'MobiCT.nf'
+ nextflowVersion = '>=22.04.0'
+ version = '1.0.0'
+}
+
+// meta parameters
dag.overwrite = true
-process.executor = 'slurm'
-workDir = "/path/to/workDir"
params {
- struct_r1 = "5M2S+T"
- struct_r2 = "5M2S+T"
- ref = "/path/to/referenceGenome.fasta"
- dict = "/path/to/referenceGenome.dict"
- fastq = "/path/to/fastq/files/*_{R1,R2}_001.fastq.gz"
- filter_fastq = /^((?!Undetermined).)*$/
- outdir = "/output/path"
- bed = "/path/to/bedFile"
- teststrandbias = "/path/to/teststrandbias.R/used/by/vardict"
- var2vcf = "/path/to/var2vcf_valid.pl/used/by/vardict"
- cache = "/path/to/cacheVEP"
- fasta = "/path/to/fastaVEP"
- tmp_dir = "/path/to/a/tmp/dir/" (optional)
+ // main params
+ ref = "/path/to/referenceGenome.fasta" // If you’re using a VEP cache (e.g. Ensembl’s), use the matching reference FASTA from Ensembl
+ fastq = "/path/to/fastq/files/*_{R1,R2}_001.fastq.gz"
+ outdir = "/path/to/outdir"
+ bed = "/path/to/bedFile"
+ cache = "" // Path to the VEP cache. If you’re using a VEP cache, use the matching reference FASTA from Ensembl:
+
+ // secondary params
+ struct_r1 = "5M2S+T"
+ struct_r2 = "5M2S+T"
+ filter_fastq = /^((?!Undetermined).)*$/
+ pipeline_report = "pipeline_report"
+}
+
+profiles {
+ slurm {
+ process.executor = 'slurm'
+ }
+
+ docker {
+ docker.enabled = true
+ docker.runOptions='-u "$( id -u ):$( id -g )"'
+ includeConfig "$baseDir/config/softwares.config"
+ }
+
+ singularity {
+ singularity.enabled = true
+ includeConfig "$baseDir/config/softwares.config"
+ }
+ conda {
+ conda.enabled = true
+ includeConfig "$baseDir/config/softwares.config"
+ }
+ // Use maximum 4 cpu per process
+ local {
+ includeConfig "$baseDir/config/ressources/local.config"
+ }
+ // Use maximum 2 cpu per process
+ ci {
+ includeConfig "$baseDir/config/ressources/ci.config"
+ }
+ test {
+ params {
+ bed = "$baseDir/Example_data/panel_lung.bed"
+ fastq = "$baseDir/Example_data/*_{R1,R2}_001.fastq.gz"
+ outdir = "result_test"
+ ref = "https://ftp.ensembl.org/pub/release-113/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.7.fa.gz"
+ }
+ }
+}
+
+resume = true
+
+timeline {
+ enabled = true
+ file = "${params.pipeline_report}/execution_timeline_${new Date().format('yyyyMMddHHmmss')}.html"
+}
+
+report {
+ enabled = true
+ file = "${params.pipeline_report}/execution_report_${new Date().format('yyyyMMddHHmmss')}.html"
+}
+
+trace {
+ enabled = true
+ file = "${params.pipeline_report}/execution_trace_${new Date().format('yyyyMMddHHmmss')}.txt"
+}
+
+dag {
+ enabled = true
+ file = "${params.pipeline_report}/pipeline_dag.svg"
}