diff --git a/config/softwares.config b/config/softwares.config index c790adb..d827ca4 100644 --- a/config/softwares.config +++ b/config/softwares.config @@ -2,6 +2,9 @@ process { withLabel: 'agat' { container = 'quay.io/biocontainers/agat:1.4.2--pl5321hdfd78af_0' } + withLabel: 'bash' { + container = 'ubuntu:24.04' + } withLabel: 'bamutil' { container = 'quay.io/biocontainers/bamutil:1.0.15--h43eeafb_4' } diff --git a/docker/reditools2/Dockerfile b/docker/reditools2/Dockerfile index e1cf014..08f5071 100644 --- a/docker/reditools2/Dockerfile +++ b/docker/reditools2/Dockerfile @@ -1,15 +1,11 @@ FROM condaforge/miniforge3:24.7.1-2 -# Auto-activating Conda setup for Python 2.7 -RUN conda config --add channels bioconda \ - && conda config --add channels conda-forge \ - && conda create -n myenv \ - && echo "conda activate myenv" >> ~/.bashrc - # Tool-specific dependencies COPY env_reditools2.yml . -RUN conda env update -n myenv --file env_reditools2.yml \ - && conda clean --all -f -y +RUN conda env create -n myenv --file env_reditools2.yml \ + && conda clean --all -f -y \ + && echo "conda activate myenv" >> ~/.bashrc # Auto-activating Conda setup for Python 2.7 + # Get and install tools RUN git clone --depth=1 https://github.com/BioinfoUNIBA/REDItools2.git reditools2 diff --git a/docker/reditools2/env_reditools2.yml b/docker/reditools2/env_reditools2.yml index 863b736..014b6be 100644 --- a/docker/reditools2/env_reditools2.yml +++ b/docker/reditools2/env_reditools2.yml @@ -1,3 +1,5 @@ +name: myenv + channels: - bioconda - conda-forge @@ -6,8 +8,8 @@ dependencies: - _openmp_mutex=4.5=2_gnu - attr=2.5.1=h166bdaf_1 - bzip2=1.0.8=h4bc722e_7 - - c-ares=1.34.4=hb9d3cd8_0 - - ca-certificates=2025.1.31=hbcca054_0 + - c-ares=1.34.5=hb9d3cd8_0 + - ca-certificates=2025.4.26=hbd8a1cb_0 - certifi=2019.11.28=py27h8c360ce_1 - htslib=1.17=h6bc39ce_1 - icu=73.2=h59595ed_0 @@ -16,55 +18,53 @@ dependencies: - ld_impl_linux-64=2.43=h712a8e2_4 - libcap=2.71=h39aace5_0 - libcurl=7.87.0=h6312ad2_0 - - libdeflate=1.23=h4ddbbb0_0 - - libedit=3.1.20250104=pl5321h7949ede_0 + - libdeflate=1.24=h86f0d12_0 + - libedit=3.1.20191231=he28a2e2_2 - libev=4.33=hd590300_2 - - libfabric=2.0.0=ha770c72_1 - - libfabric1=2.0.0=h14e6f36_1 + - libfabric=2.1.0=ha770c72_0 + - libfabric1=2.1.0=h14e6f36_0 - libffi=3.2.1=he1b5a44_1007 - - libgcc=14.2.0=h767d61c_2 - - libgcc-ng=14.2.0=h69a702a_2 - - libgcrypt-lib=1.11.0=hb9d3cd8_2 - - libgfortran=14.2.0=h69a702a_2 - - libgfortran5=14.2.0=hf1ad2bd_2 - - libgomp=14.2.0=h767d61c_2 - - libgpg-error=1.51=hbd13f7d_1 + - libgcc=15.1.0=h767d61c_2 + - libgcc-ng=15.1.0=h69a702a_2 + - libgcrypt-lib=1.11.1=hb9d3cd8_0 + - libgfortran=15.1.0=h69a702a_2 + - libgfortran5=15.1.0=hcea5267_2 + - libgomp=15.1.0=h767d61c_2 + - libgpg-error=1.55=h3f2d84a_0 - libhwloc=2.11.2=default_he43201b_1000 - libiconv=1.18=h4ce23a2_1 - - liblzma=5.6.4=hb9d3cd8_0 - libnghttp2=1.51.0=hdcd2b5c_0 - libnl=3.11.0=hb9d3cd8_0 - - libsqlite=3.46.0=hde9e2c9_0 - libssh2=1.10.0=haa6b8db_3 - - libstdcxx=14.2.0=h8f9b012_2 - - libstdcxx-ng=14.2.0=h4852527_2 - - libsystemd0=257.3=h3dc2cb9_0 - - libudev1=257.3=h9a4d06a_0 + - libstdcxx=15.1.0=h8f9b012_2 + - libstdcxx-ng=15.1.0=h4852527_2 + - libsystemd0=256.9=h2774228_0 + - libudev1=257.4=h9a4d06a_0 - libxml2=2.12.7=hc051c1a_1 - libzlib=1.2.13=h4ab18f5_6 - - lz4-c=1.10.0=h5888daf_1 + - lz4-c=1.9.4=hcb278e6_0 - mpi=1.0.1=mpich - mpi4py=3.0.3=py27hf423c55_1 - mpich=4.3.0=h1a8bee6_100 - - ncurses=6.5=h2d0b736_3 - - netifaces=0.10.9=py27hdf8410d_1002 + - ncurses=6.2=h58526e2_4 + - netifaces=0.10.4=py27_1 - openssl=1.1.1w=hd590300_0 - pip=20.1.1=pyh9f0ad1d_0 - psutil=5.7.0=py27hdf8410d_1 - pysam=0.20.0=py27h7835474_0 - python=2.7.15=h5a48372_1011_cpython - python_abi=2.7=1_cp27mu - - rdma-core=56.0=h5888daf_0 - - readline=8.2=h8c095d6_2 - - samtools=1.18=hd87286a_0 + - rdma-core=55.0=h5888daf_0 + - readline=8.1=h46c0cb4_0 + - samtools=1.12=h9aed4be_1 - setuptools=44.0.0=py27_0 - sortedcontainers=2.4.0=pyhd8ed1ab_0 - - sqlite=3.46.0=h6d4b2fc_0 + - sqlite=3.37.0=h9cd32fc_0 - tabix=1.11=hdfd78af_0 - tk=8.6.13=noxft_h4845f30_101 - - ucx=1.18.0=hfd9a62f_1 + - ucx=1.18.0=hfd9a62f_3 - wheel=0.37.1=pyhd8ed1ab_0 - xz=5.2.6=h166bdaf_0 - zlib=1.2.13=h4ab18f5_6 - zstd=1.5.6=ha6fb4c9_0 - + \ No newline at end of file diff --git a/lib/Rain_utilities.groovy b/lib/Rain_utilities.groovy new file mode 100644 index 0000000..d692afa --- /dev/null +++ b/lib/Rain_utilities.groovy @@ -0,0 +1,24 @@ +// dedicated to the utility functions +class Rain_utilities { + + // Function to chek if we + public static Boolean is_url(file) { + + // check if the file is a URL + if (file =~ /^(http|https|ftp|s3|az|gs):\/\/.*/) { + return true + } else { + return false + } + } + // Function to chek if we + public static Boolean is_fastq(file) { + + // check if the file is a URL + if (file =~ /.*(fq|fastq|fq.gz|fastq.gz)$/) { + return true + } else { + return false + } + } +} \ No newline at end of file diff --git a/modules/aline.nf b/modules/aline.nf index be6eed3..73e9f8d 100644 --- a/modules/aline.nf +++ b/modules/aline.nf @@ -30,7 +30,7 @@ process AliNe { profile, config, reads, - "--genome ${genome}", + "--reference ${genome}", read_type, aligner, library_type, diff --git a/modules/bamutil.nf b/modules/bamutil.nf index 17cc7c9..723ea29 100644 --- a/modules/bamutil.nf +++ b/modules/bamutil.nf @@ -1,18 +1,19 @@ process bamutil_clipoverlap { label 'bamutil' - tag "$sample_id" + tag "${meta.id}" publishDir "${params.outdir}/bamutil_clipoverlap", mode: 'copy' input: - tuple val(sample_id), path(bam) + tuple val(meta), path(bam) output: - tuple val(sample_id), path ("*clipoverlap.bam"), emit: tuple_sample_clipoverbam + tuple val(meta), path ("*clipoverlap.bam"), emit: tuple_sample_clipoverbam path ("*_bamutil_clipoverlap.log"), emit: log script: - """ - bam clipOverlap --storeOrig CG --poolSize 50000000 --in ${bam} --out ${bam}_clipoverlap.bam --stats > ${sample_id}_bamutil_clipoverlap.log - """ + + """ + bam clipOverlap --storeOrig CG --poolSize 50000000 --in ${bam} --out ${bam}_clipoverlap.bam --stats > ${meta.id}_bamutil_clipoverlap.log + """ } diff --git a/modules/bash.nf b/modules/bash.nf new file mode 100644 index 0000000..e877bf6 --- /dev/null +++ b/modules/bash.nf @@ -0,0 +1,21 @@ +/* +Here are described all processes related to bash +*/ + +// A process to compute the mean read length of a FASTQ +process extract_libtype { + label 'bash' + tag "$id" + + input: + tuple val(id), path(samlmon_json) + + output: + tuple val(id), env(LIBTYPE), emit: tuple_id_libtype + + script: + """ + LIBTYPE=\$(grep expected_format ${samlmon_json} | awk '{print \$2}' | tr -d '",\n') + """ + +} \ No newline at end of file diff --git a/modules/fastp.nf b/modules/fastp.nf index 2f48aeb..74401da 100644 --- a/modules/fastp.nf +++ b/modules/fastp.nf @@ -1,49 +1,57 @@ process fastp { label 'fastp' - publishDir "${params.output}/${id}/qc", mode: 'copy' + tag "${meta.id}" + publishDir "${params.output}/${meta.id}/qc", mode: 'copy' + input: - tuple val(id), path(illumina) + tuple val(meta), path(illumina) val(phred_type) + output: - tuple val(id), path("*_R?_clean.fastq") - path("${id}_fastp_report.html") + tuple val(meta), path("*_R?_clean.fastq") + path("${meta.id}_fastp_report.html") + script: """ if [ "${phred_type}" == "64" ] then fastp -i ${illumina[0]} -I ${illumina[1]} \ - -o ${id}_R1_clean.fastq -O ${id}_R2_clean.fastq \ + -o ${meta.id}_R1_clean.fastq -O ${meta.id}_R2_clean.fastq \ --phred64 \ - --detect_adapter_for_pe --html ${id}_fastp_report.html + --detect_adapter_for_pe --html ${meta.id}_fastp_report.html else fastp -i ${illumina[0]} -I ${illumina[1]} \ - -o ${id}_R1_clean.fastq -O ${id}_R2_clean.fastq \ - --detect_adapter_for_pe --html ${id}_fastp_report.html + -o ${meta.id}_R1_clean.fastq -O ${meta.id}_R2_clean.fastq \ + --detect_adapter_for_pe --html ${meta.id}_fastp_report.html fi """ } process fastp_hybrid { label 'fastp' + tag "${meta.id}" publishDir "${params.output}/${id}/qc", mode: 'copy' + input: - tuple val(id), path(illuminaR1), path(illuminaR2), path(ont) + tuple val(meta), path(illuminaR1), path(illuminaR2), path(ont) val(phred_type) + output: - tuple val(id), path("${id}_R1_clean.fastq"),path("${id}_R2_clean.fastq"), path(ont), emit: trimmed_hybrid - path("${id}_fastp_report.html") + tuple val(meta), path("${meta.id}_R1_clean.fastq"),path("${meta.id}_R2_clean.fastq"), path(ont), emit: trimmed_hybrid + path("${meta.id}_fastp_report.html") + script: """ if [ "${phred_type}" == "64" ] then fastp -i ${illuminaR1} -I ${illuminaR2} \ - -o ${id}_R1_clean.fastq -O ${id}_R2_clean.fastq \ + -o ${meta.id}_R1_clean.fastq -O ${meta.id}_R2_clean.fastq \ --phred64 \ - --detect_adapter_for_pe --html ${id}_fastp_report.html + --detect_adapter_for_pe --html ${meta.id}_fastp_report.html else fastp -i ${illuminaR1} -I ${illuminaR2} \ - -o ${id}_R1_clean.fastq -O ${id}_R2_clean.fastq \ - --detect_adapter_for_pe --html ${id}_fastp_report.html + -o ${meta.id}_R1_clean.fastq -O ${meta.id}_R2_clean.fastq \ + --detect_adapter_for_pe --html ${meta.id}_fastp_report.html fi """ } diff --git a/modules/fastqc.nf b/modules/fastqc.nf index 9aa5518..b83e6bb 100644 --- a/modules/fastqc.nf +++ b/modules/fastqc.nf @@ -1,19 +1,19 @@ process fastqc { label 'fastqc' - tag "$sample_id" + tag "${meta.id}" publishDir "${params.outdir}/FastQC", mode: 'copy' input: - tuple val(sample_id), path(reads) - val (suffix) + tuple val(meta), path(reads) + val (suffix) output: - path ("fastqc_${sample_id}_logs_${suffix}") + path ("fastqc_${meta.id}_logs_${suffix}") script: - """ - mkdir fastqc_${sample_id}_logs_${suffix} - fastqc -t ${task.cpus} -o fastqc_${sample_id}_logs_${suffix} -q ${reads} - """ + """ + mkdir fastqc_${meta.id}_logs_${suffix} + fastqc -t ${task.cpus} -o fastqc_${meta.id}_logs_${suffix} -q ${reads} + """ } diff --git a/modules/gatk.nf b/modules/gatk.nf index 32d25ff..c6cb602 100644 --- a/modules/gatk.nf +++ b/modules/gatk.nf @@ -1,22 +1,22 @@ process gatk_markduplicates { label 'gatk' - tag "$sample_id" + tag "${meta.id}" publishDir "${params.outdir}/gatk_markduplicates", mode: 'copy' input: - tuple val(sample_id), path(bam) + tuple val(meta), path(bam) output: - tuple val(sample_id), path ("*_marked_duplicates.bam"), emit: tuple_sample_dedupbam - path ("*_marked_dup_metrics.txt") , emit: log + tuple val(meta), path ("*_marked_duplicates.bam"), emit: tuple_sample_dedupbam + path ("*_marked_dup_metrics.txt") , emit: log script: - """ - gatk MarkDuplicates \ - -I ${bam} \ - -O ${bam.baseName}_marked_duplicates.bam \ - -M ${bam.baseName}_marked_dup_metrics.txt \ - --REMOVE_DUPLICATES - """ + """ + gatk MarkDuplicates \ + -I ${bam} \ + -O ${bam.baseName}_marked_duplicates.bam \ + -M ${bam.baseName}_marked_dup_metrics.txt \ + --REMOVE_DUPLICATES + """ } diff --git a/modules/jacusa2.nf b/modules/jacusa2.nf index 12ad31f..e789f87 100644 --- a/modules/jacusa2.nf +++ b/modules/jacusa2.nf @@ -1,18 +1,19 @@ process jacusa2 { label "jacusa2" + tag "${meta.id}" publishDir("${params.outdir}/jacusa2", mode: "copy") input: - tuple(val(sample), path(bam), path(bamindex)) - tuple(path(genome), path(fastaindex)) + tuple(val(meta), path(bam), path(bamindex)) + tuple(path(genome), path(fastaindex)) output: - path("*.txt") - path("*.filtered") - // tuple(sample, path("filtered_output.txt", emit: tuple_sample_jacusa2_table)) + path("*.txt") + path("*.filtered") + // tuple(sample, path("filtered_output.txt", emit: tuple_sample_jacusa2_table)) script: - """ - java -Xmx${task.memory.toMega()}M -jar /usr/local/bin/JACUSA_v2.0.4.jar call-1 -a D -f V -p ${task.cpus} -r jacusa_out.txt -c 1 -s -R ${genome} ${bam} - """ + """ + java -Xmx${task.memory.toMega()}M -jar /usr/local/bin/JACUSA_v2.0.4.jar call-1 -a D -f V -p ${task.cpus} -r jacusa_out.txt -c 1 -s -R ${genome} ${bam} + """ } \ No newline at end of file diff --git a/modules/multiqc.nf b/modules/multiqc.nf index bdeab5b..2d9c59a 100644 --- a/modules/multiqc.nf +++ b/modules/multiqc.nf @@ -3,15 +3,15 @@ process multiqc { publishDir "${params.outdir}/MultiQC", mode: 'copy' input: - path log_files - path multiqc_config + path log_files + path multiqc_config output: - path "*multiqc_report.html" - path "*_data" + path "*multiqc_report.html" + path "*_data" script: - """ - multiqc -p . -c ${multiqc_config} - """ + """ + multiqc -p . -c ${multiqc_config} + """ } \ No newline at end of file diff --git a/modules/reditools2.nf b/modules/reditools2.nf index f052544..e3b19b8 100644 --- a/modules/reditools2.nf +++ b/modules/reditools2.nf @@ -1,17 +1,36 @@ process reditools2 { label "reditools2" - publishDir("${params.outdir}/edit_tables", mode: "copy") + publishDir("${params.outdir}/reditools", mode: "copy") + tag "${meta.id}" input: - tuple(val(sample), path(bam), path(bamindex)) - path genome - val region + tuple(val(meta), path(bam), path(bamindex)) + path genome + val region output: - tuple(val(sample), path("serial_table.txt"), emit: tuple_sample_serial_table) + tuple(val(meta), path("edit_table.txt"), emit: tuple_sample_serial_table) script: + + // Set the strand orientation parameter from the library type parameter + // Terms explained in https://salmon.readthedocs.io/en/latest/library_type.html + if (meta.libtype in ["ISR", "SR"]) { + // First-strand oriented + strand_orientation = "2" + } else if (meta.libtype in ["ISF", "SF"]) { + // Second-strand oriented + strand_orientation = "1" + } else if (meta.libtype in ["IU", "U"]) { + // Unstranded + strand_orientation = "0" + } else { + // Unsupported: Pass the library type string so that it's reported in + // the reditools error message + strand_orientation = meta.libtype + } + """ - reditools.py -f ${bam} -r ${genome} -S -o serial_table.txt + reditools.py -f ${bam} -r ${genome} -s ${strand_orientation} -o edit_table.txt """ } diff --git a/modules/samtools.nf b/modules/samtools.nf index aecdf40..86cc39a 100644 --- a/modules/samtools.nf +++ b/modules/samtools.nf @@ -1,13 +1,13 @@ process samtools_index { label "samtools" - tag "$sample" + tag "${meta.id}" publishDir("${params.outdir}/bam_indices", mode:"copy") input: - tuple(val(sample), path(bam)) + tuple(val(meta), path(bam)) output: - tuple(val(sample), path(bam), path("*.bai"), emit: tuple_sample_bam_bamindex) + tuple(val(meta), path(bam), path("*.bai"), emit: tuple_sample_bam_bamindex) script: """ @@ -17,17 +17,44 @@ process samtools_index { process samtools_fasta_index { label "samtools" - tag "genome" + tag "${fasta}" publishDir("${params.outdir}/fasta_indices", mode:"copy") input: - path(fasta) + path(fasta) output: - tuple(path(fasta), path("${fasta}.fai"), emit: tuple_fasta_fastaindex) + tuple(path(fasta), path("${fasta}.fai"), emit: tuple_fasta_fastaindex) script: - """ - samtools faidx ${fasta} - """ + """ + samtools faidx ${fasta} + """ } + +/* +http://www.htslib.org/doc/samtools-sort.html +Sort alignments by leftmost coordinates +*/ +process samtools_sort_bam { + label "samtools" + tag "${meta.id}" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path ("*_sorted.bam"), emit: tuple_sample_sortedbam + + script: + + """ + if [[ \$(samtools view -H ${bam}| awk '/^@HD/ { for(i=1;i<=NF;i++) if(\$i ~ /^SO:/) print \$i }') == *"coordinate"* ]]; then + echo "Already sorted by coordinate" + ln -s ${bam} ${bam.baseName}_sorted.bam + else + echo "Not sorted by coordinate" + samtools sort -@ ${task.cpus} ${bam} -o ${bam.baseName}_sorted.bam + fi + """ +} \ No newline at end of file diff --git a/modules/sapin.nf b/modules/sapin.nf index 9b6dec2..fe9f57a 100644 --- a/modules/sapin.nf +++ b/modules/sapin.nf @@ -1,16 +1,17 @@ process sapin { label "sapin" + tag "${meta.id}" publishDir("${params.outdir}/sapin", mode: "copy") input: - tuple(val(sample), path(bam)) - path(reference) + tuple(val(meta), path(bam)) + path(reference) output: - path("restable.tsv") + path("restable.tsv") script: - """ - sapin -a ${bam} -f ${reference} > restable.tsv - """ + """ + sapin -a ${bam} -f ${reference} > restable.tsv + """ } diff --git a/rain.nf b/rain.nf index c75dc4b..f8efe50 100644 --- a/rain.nf +++ b/rain.nf @@ -10,13 +10,12 @@ import java.nio.file.* //************************************************* // Input/output params -params.reads = "/path/to/reads_{1,2}.fastq.gz/or/folder" -params.genome = "/path/to/genome.fa" -params.annotation = "/path/to/annotations.gff3" -params.outdir = "result" -params.reads_extension = ".fastq.gz" // Extension used to detect reads in folder -params.paired_reads_pattern = "_{1,2}" - +params.reads = null // "/path/to/reads_{1,2}.fastq.gz/or/folder" +params.bam = null // "/path/to/reads.bam/or/folder" +params.csv = null // "/path/to/reads.bam/or/folder" +params.genome = null // "/path/to/genome.fa" +params.annotation = null // "/path/to/annotations.gff3" +params.outdir = "result" /* Specific AliNe params (some are shared with RAIN)*/ @@ -24,6 +23,8 @@ params.paired_reads_pattern = "_{1,2}" read_type_allowed = [ 'short_paired', 'short_single', 'pacbio', 'ont' ] params.read_type = "short_paired" // short_paired, short_single, pacbio, ont params.library_type = "auto" // can be 'U', 'IU', 'MU', 'OU', 'ISF', 'ISR', 'MSF', 'MSR', 'OSF', 'OSR', 'auto' - see https://github.com/Juke34/AliNe for more information +params.bam_library_type = null // can be 'U', 'IU', 'MU', 'OU', 'ISF', 'ISR', 'MSF', 'MSR', 'OSF', 'OSR', 'auto' - see https://github.com/Juke34/AliNe for more information + // Aline profiles aline_profile_allowed = [ 'docker', 'singularity', 'local', 'itrop' ] @@ -43,58 +44,47 @@ params.region = "" // e.g. chr21 - Used to limit the analysis to a specific regi // Report params params.multiqc_config = "$baseDir/config/multiqc_conf.yml" +// other +params.help = null +params.monochrome_logs = false // if true, no color in logs + //************************************************* -// STEP 1 - LOG INFO +// STEP 1 - HELP //************************************************* -// Header message -log.info """ -IRD -.-./`) .-------. ______ -\\ .-.')| _ _ \\ | _ `''. -/ `-' \\| ( ' ) | | _ | ) _ \\ - `-'`\"`|(_ o _) / |( ''_' ) | - .---. | (_,_).' __ | . (_) `. | - | | | |\\ \\ | ||(_ ._) ' - | | | | \\ `' /| (_.\\.' / - | | | | \\ / | .' - '---' ''-' `'-' '-----'` - - -RAIN - RNA Alterations Investigation using Nextflow -=================================================== - -""" - +log.info header() if (params.help) { exit 0, helpMSG() } // Help Message def helpMSG() { log.info """ - ********* RAIN - RNA Alterations Investigation using Nextflow ********* + RAIN - RNA Alterations Investigation using Nextflow - v${workflow.manifest.version} Usage example: - nextflow run main.nf --illumina short_reads_Ecoli --genus Escherichia --species coli --species_taxid 562 -profile docker -resume - --help prints the help section + nextflow run rain.nf -profile docker --genome /path/to/genome.fa --annotation /path/to/annotation.gff3 --reads /path/to/reads_folder --output /path/to/output --aligner hisat2 - Input sequences: - --reads path to the illumina read file (fastq or fastq.gz) (default: $params.reads) - --genome path to the genome (default: $params.genome) + Parameters: + --help Prints the help section - Annotation input: - --annotation path to a GFF3 file with annotations of genomic features + Input sequences: + --annotation Path to the annotation file (GFF or GTF) + --bam Path to the bam file or folder. + --csv Path to the csv file containing fastq/bam files and their metadata. The csv file must contain a 'sample' column with the sample name, a 'input_1' column with the path to the bam file/or fastq1 file, a 'input_2' column with the path to R2 fastq file in case of paired data (either empty), and a 'strandedness' column with the library type (e.g. U, IU, MU, OU, ISF, ISR, MSF, MSR, OSF, OSR). + --reads Path to the reads file or folder. If a folder is provided, all the files with proper extension (.fq, .fastq, .fq.gz, .fastq.gz) in the folder will be used. You can provide remote files (commma separated list). + --genome Path to the reference genome in FASTA format. Output: - --output path to the output directory (default: $params.outdir) + --output Path to the output directory (default: $params.outdir) Optional input: - --aligner Aligner to use [default: $params.aligner] - --read_type type of reads among this list ${read_type_allowed} (default: short_paired) - --reads_extension Extension of the read files [default: $params.reads_extension] + --aligner Aligner to use [default: $params.aligner] + --read_type Type of reads among this list ${read_type_allowed} (default: short_paired) + --library_type Set the library_type of your fastq reads (default: auto). In auto mode salmon will guess the library type for each sample. [ 'U', 'IU', 'MU', 'OU', 'ISF', 'ISR', 'MSF', 'MSR', 'OSF', 'OSR', 'auto' ] + --bam_library_type Set the library_type of your BAM reads (default: null). When using BAM you must set a library type: [ 'U', 'IU', 'MU', 'OU', 'ISF', 'ISR', 'MSF', 'MSR', 'OSF', 'OSR', 'auto' ] + Nextflow options: - -profile change the profile of nextflow both the engine and executor more details on github README [debug, test, itrop, singularity, local, docker] - -resume resume the workflow where it stopped + -profile Change the profile of nextflow both the engine and executor more details on github README [debug, test, itrop, singularity, local, docker] """ } @@ -103,22 +93,24 @@ def helpMSG() { log.info """ General Parameters - genome : ${params.genome} - annotation : ${params.annotation} - reads : ${params.reads} - read_type : ${params.read_type} - outdir : ${params.outdir} + annotation : ${params.annotation} + bam : ${params.bam} + csv : ${params.csv} + reads : ${params.reads} + genome : ${params.genome} + + reads_library_type : ${params.library_type} + bam_library_type : ${params.bam_library_type} + outdir : ${params.outdir} Alignment Parameters - aline_profiles : ${params.aline_profiles} - aligner : ${params.aligner} - library_type : ${params.library_type} - paired_reads_pattern : ${params.paired_reads_pattern} - reads_extension : ${params.reads_extension} + aline_profiles : ${params.aline_profiles} + aligner : ${params.aligner} + reads_library_type : ${params.library_type} Report Parameters MultiQC parameters - multiqc_config : ${params.multiqc_config} + multiqc_config : ${params.multiqc_config} """ @@ -127,13 +119,14 @@ Report Parameters // STEP 2 - Include needed modules //************************************************* include { AliNe as ALIGNMENT } from "./modules/aline.nf" +include { extract_libtype } from "./modules/bash.nf" include {bamutil_clipoverlap} from './modules/bamutil.nf' include {fastp} from './modules/fastp.nf' include {fastqc as fastqc_raw; fastqc as fastqc_ali; fastqc as fastqc_dup; fastqc as fastqc_clip} from './modules/fastqc.nf' include {gatk_markduplicates } from './modules/gatk.nf' include {multiqc} from './modules/multiqc.nf' include {fasta_uncompress} from "$baseDir/modules/pigz.nf" -include {samtools_index; samtools_fasta_index} from './modules/samtools.nf' +include {samtools_index; samtools_fasta_index; samtools_sort_bam} from './modules/samtools.nf' include {reditools2} from "./modules/reditools2.nf" include {jacusa2} from "./modules/jacusa2.nf" include {sapin} from "./modules/sapin.nf" @@ -185,37 +178,229 @@ def aline_profile = aline_profile_list.join(',') workflow { main: - - // check if genome exists +// ---------------------------------------------------------------------------- + // --- DEAL WITH REFERENCE --- + // check if reference exists Channel.fromPath(params.genome, checkIfExists: true) - .ifEmpty { exit 1, "Cannot find genome matching ${params.genome}!\n" } - .set{genome_raw} + .ifEmpty { exit 1, "Cannot find genome matching ${params.genome}!\n" } + .set{genome_raw} // uncompress it if needed fasta_uncompress(genome_raw) fasta_uncompress.out.genomeFa.set{genome_ch} // set genome to the output of fasta_uncompress +// ---------------------------------------------------------------------------- + // --- DEAL WITH ANNOTATION --- + Channel.empty().set{annotation_ch} + if (params.annotation){ + Channel.fromPath(params.annotation, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find annotation matching ${params.annotation}!\n" } + .set{annotation_ch} + } +// ---------------------------------------------------------------------------- + def path_csv = params.csv + def path_bam = params.bam + def path_reads = params.reads + def has_fastq = false + // ---------------------- DEAL WITH BAM FILES ---------------------- + Channel.empty().set{sorted_bam} + if ( path_bam || path_csv ) { + + def bam_list=[] + def via_url = false + def via_csv = false + + if ( path_csv && path_csv.endsWith('.csv') ){ + log.info "Using CSV input file: ${path_csv}" + // --------- BAM CSV CASE --------- + via_csv = true + File input_csv = new File(path_csv) + if(!input_csv.exists()){ + error "The input ${params.csv} file does not exist!\n" + } + + bams = Channel.fromPath(params.csv) + .splitCsv(header: true, sep: ',') + .map { row -> + if(row.sample == null || row.sample.trim() == ''){ + error "The input ${params.csv} file does not contain a 'sample' column!\n" + } + def sample_id = row.sample + if(row.input_1 == null || row.input_1.trim() == ''){ + error "The input ${params.csv} file does not contain a 'input_1' column!\n" + } + if( row.input_1.toString().endsWith('.bam') ){ + def input_1 = file(row.input_1.trim()) + + if (! Rain_utilities.is_url(input_1) ) { + if (! input_1.exists() ) { + error "The input ${input_1} file does not does not exits!\n" + } + } else { + log.info "This bam input is an URL: ${input_1}" + } + + if(row.strandedness == null || row.strandedness.trim() == ''){ + error "The input ${params.csv} file does not contain a 'strandedness' column!\n" + } + def libtype = row.strandedness ?: 'U' + def meta = [ id: sample_id, libtype: libtype ] + return tuple(meta, input_1) + } + else if ( Rain_utilities.is_fastq(row.input_1.trim()) ) { + has_fastq = true + /* def input_1 = file(row.input_1.trim()) + if (! Rain_utilities.is_url(input_1) ) { + if (! input_1.exists() ) { + error "The input ${input_1} file does not does not exits!\n" + } + } else { + log.info "This fastq input is an URL: ${input_1}" + } + */ + } + else { + error "The input ${row.input_1} file is not a BAM or FASTQ file!\n" + } + } + } + else { + + // --------- BAM LIST CASE --------- + if( path_bam.indexOf(',') >= 0) { + // Cut into list with coma separator + str_list = path_bam.tokenize(',') + // loop over elements + str_list.each { + str_list2 = it.tokenize(' ') + str_list2.each { + if ( it.endsWith('.bam') ){ + if ( Rain_utilities.is_url(it) ) { + log.info "This bam input is an URL: ${it}" + via_url = true + } + bam_list.add(file(it)) // use file insted of File for URL + } + } + } + } + else { + // --------- BAM FOLDER CASE --------- + File input_reads = new File(path_bam) + if(input_reads.exists()){ + if ( input_reads.isDirectory()) { + log.info "The input ${path_bam} is a folder!" + // in case of folder provided, add a trailing slash if missing + path_bam = "${input_reads}" + "/*.bam" + } + // --------- BAM FILE CASE --------- + else { + if ( path_bam.endsWith('.bam') ){ + log.info "The input ${path_bam} is a bam file!" + if ( Rain_utilities.is_url(path_bam) ) { + log.info "This bam input is an URL: ${path_bam}" + via_url = true + } + } + } + } + } + } + + // Add lib + if (! via_csv) { + if (via_url ){ + my_samples = Channel.of(bam_list) + bams = my_samples.flatten().map { it -> [it.name.split('_')[0], it] } + .groupTuple() + .ifEmpty { exit 1, "Cannot find reads matching ${path_reads}!\n" } + } else { + bams = Channel.fromFilePairs(path_bam, size: 1, checkIfExists: true) + .ifEmpty { exit 1, "Cannot find reads matching ${path_bam}!\n" } + } + + // check if bam library type is provided when bam files are used + bams.collect().map { list -> + if (!list.isEmpty() && !params.bam_library_type ) { + exit 1, "⚠️ When using BAM files, the library type must be provided. Please use the parameter --bam_library_type .\n" + } + } + // Set structure with dictionary as first value + bams = bams.map { id, bam_file -> + def meta = [ id: id, libtype: params.bam_library_type ] + tuple(meta, bam_file) + } + } + // sort the bam files + sorted_bam = samtools_sort_bam( bams ) + } +// ---------------------------------------------------------------------------- + // DEAL WITH FASTQ FILES + // Perform AliNe alignment + Channel.empty().set{aline_alignments_all} + if (path_reads || ( path_csv && has_fastq) ) { + + if ( path_csv && path_csv.endsWith('.csv') ){ + path_reads = path_csv + } + ALIGNMENT ( + 'Juke34/AliNe -r v1.4.0', // Select pipeline + "${workflow.resume?'-resume':''} -profile ${aline_profile}", // workflow opts supplied as params for flexibility + "-config ${params.aline_profiles}", + "--reads ${path_reads}", + genome_ch, + "--read_type ${params.read_type}", + "--aligner ${params.aligner}", + "--library_type ${params.library_type}", + workflow.workDir.resolve('Juke34/AliNe').toUriString() + ) + + // GET TUPLE [ID, BAM] FILES + ALIGNMENT.out.output + .map { dir -> + files("$dir/alignment/*/*.bam", checkIfExists: true) // Find BAM files inside the output directory + } + .flatten() // Ensure we emit each file separately + .map { bam -> + def name = bam.getName().split('_seqkit')[0] // Extract the base name of the BAM file. _seqkit is the separator. + tuple(name, bam) + } // Convert each BAM file into a tuple, with the base name as the first element + .set { aline_alignments } // Store the channel + + if (params.library_type.contains("auto") ) { + log.info "Library type is set to auto, extracting it from salmon output" + // GET TUPLE [ID, OUTPUT_SALMON_LIBTYPE] FILES + ALIGNMENT.out.output + .map { dir -> + files("$dir/salmon_libtype/*/*.json", checkIfExists: true) // Find BAM files inside the output directory + } + .flatten() // Ensure we emit each file separately + .map { json -> + def name = json.getParent().getName().split('_seqkit')[0] // Extract the base name of the BAM file. _seqkit is the separator. The name is in the fodler containing the json file. Why take this one? Because it is the same as teh bam name set by Aline. It will be used to sync both values + tuple(name, json) + } // Convert each BAM file into a tuple, with the base name as the first element + .set { aline_libtype } // Store the channel + // Extract the library type from the JSON file + aline_libtype = extract_libtype(aline_libtype) + aline_alignments.join(aline_libtype) + .map { key, val1, val2 -> tuple(key, val1, val2) } + .set { aline_alignments_all } + } else { + log.info "Library type is set to ${params.library_type}, no need to extract it from salmon output" + aline_alignments_all = aline_alignments.map { name, bam -> tuple(name, bam, params.library_type) } + } - ALIGNMENT ( - 'Juke34/AliNe -r v1.3.0', // Select pipeline - "${workflow.resume?'-resume':''} -profile ${aline_profile}", // workflow opts supplied as params for flexibility - "-config ${params.aline_profiles}", - "--reads ${params.reads}", - genome_ch, - "--read_type ${params.read_type}", - "--aligner ${params.aligner}", - "--library_type ${params.library_type}", - workflow.workDir.resolve('Juke34/AliNe').toUriString() - ) - ALIGNMENT.out.output - .map { dir -> - files("$dir/alignment/*/*.bam", checkIfExists: true) // Find BAM files inside the output directory + // transform [ID, BAM, LIBTYPE] into [[id: 'ID', libtype: 'LIBTYPE'], file('BAM')] + aline_alignments_all = aline_alignments_all.map { id, file, lib -> + def meta = [ id: id, libtype: lib ] + tuple(meta, file) } - .flatten() // Ensure we emit each file separately - .map { bam -> tuple(bam.baseName, bam) } // Convert each BAM file into a tuple, with the base name as the first element - .set { aline_alignments } // Store the channel + } - // aline_alignments.view() - rain(aline_alignments, genome_ch) + // call rain + all_bams = aline_alignments_all.mix(sorted_bam) + log.info "The following bam file(s) will be processed by RAIN:" + all_bams.view() + rain(all_bams, genome_ch, annotation_ch) } workflow rain { @@ -223,6 +408,7 @@ workflow rain { take: tuple_sample_sortedbam genome + annnotation main: @@ -253,5 +439,69 @@ workflow rain { samtools_fasta_index(genome) jacusa2(samtools_index.out.tuple_sample_bam_bamindex, samtools_fasta_index.out.tuple_fasta_fastaindex) sapin(bamutil_clipoverlap.out.tuple_sample_clipoverbam, genome) - normalize_gxf(params.annotation) + normalize_gxf(annnotation) +} + + +//************************************************* +def header(){ + // Log colors ANSI codes + c_reset = params.monochrome_logs ? '' : "\033[0m"; + c_dim = params.monochrome_logs ? '' : "\033[2m"; + c_black = params.monochrome_logs ? '' : "\033[0;30m"; + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; + c_blue = params.monochrome_logs ? '' : "\033[0;34m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; + c_cyan = params.monochrome_logs ? '' : "\033[0;36m"; + c_white = params.monochrome_logs ? '' : "\033[0;37m"; + c_red = params.monochrome_logs ? '' : "\033[0;31m"; + + return """ + -${c_dim}--------------------------------------------------${c_reset}- + ${c_blue}.-./`) ${c_white}.-------. ${c_red} ______${c_reset} + ${c_blue}\\ .-.')${c_white}| _ _ \\ ${c_red} | _ `''.${c_reset} French National + ${c_blue}/ `-' \\${c_white}| ( ' ) | ${c_red} | _ | ) _ \\${c_reset} + ${c_blue} `-'`\"`${c_white}|(_ o _) / ${c_red} |( ''_' ) |${c_reset} Research Institute for + ${c_blue} .---. ${c_white}| (_,_).' __ ${c_red}| . (_) `. |${c_reset} + ${c_blue} | | ${c_white}| |\\ \\ | |${c_red}|(_ ._) '${c_reset} Sustainable Development + ${c_blue} | | ${c_white}| | \\ `' /${c_red}| (_.\\.' /${c_reset} + ${c_blue} | | ${c_white}| | \\ / ${c_red}| .'${c_reset} + ${c_blue} '---' ${c_white}''-' `'-' ${c_red}'-----'`${c_reset} + ${c_purple} RAIN - RNA Alterations Investigation using Nextflow - v${workflow.manifest.version}${c_reset} + -${c_dim}--------------------------------------------------${c_reset}- + """.stripIndent() +} + +/************** onComplete ***************/ + +workflow.onComplete { + + // Log colors ANSI codes + c_reset = params.monochrome_logs ? '' : "\033[0m"; + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_red = params.monochrome_logs ? '' : "\033[0;31m"; + + if (workflow.success) { + log.info "\n${c_green} RAIN pipeline complete!" + } else { + log.error "${c_red}Oops .. something went wrong}" + } + + log.info " The results are available in the ‘${params.outdir}’ directory." + + def dateComplete = workflow.complete.format("dd-MMM-yyyy HH:mm:ss") + def duration = workflow.duration + def succeeded = workflow.stats['succeededCount'] ?: 0 + def cached = workflow.stats['cachedCount'] ?: 0 + log.info """ + ================ RAIN Pipeline Summary ================ + Completed at: ${dateComplete} + UUID : ${workflow.sessionId} + Duration : ${duration} + Succeeded : ${succeeded} + Cached : ${cached} + ======================================================= + ${c_reset} + """ } \ No newline at end of file