diff --git a/bin/restore_sequences.py b/bin/restore_sequences.py new file mode 100755 index 0000000..de0c3bf --- /dev/null +++ b/bin/restore_sequences.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +""" +Restore original sequences in BAM files from original unmapped BAM. +This tool replaces modified sequences with their original counterparts. +""" + +import sys +import argparse +import pysam + + +def fix_bam(bam_aligned, bam_unmapped, bam_out): + """Restore original sequences using indexed lookup.""" + # Create index for fast read name lookup + unmapped = pysam.AlignmentFile(bam_unmapped, "rb") + unmapped_index = pysam.IndexedReads(unmapped) + unmapped_index.build() + + with pysam.AlignmentFile(bam_aligned, "rb") as bamfile: + with pysam.Samfile(bam_out, 'wb', template=bamfile) as out: + for read in bamfile: + try: + # Fetch original read by name (returns iterator) + for orig_read in unmapped_index.find(read.query_name): + read.query_sequence = orig_read.query_sequence + read.query_qualities = orig_read.query_qualities + read.set_tag("MD", None) + break # Take first match + out.write(read) + except KeyError: + pass # Read not found in unmapped BAM + + unmapped.close() + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("-b", "--bam", required=True, help="aligned BAM file") + parser.add_argument("-u", "--unmapped", required=True, help="original unmapped BAM file") + parser.add_argument("-o", "--output", required=True, help="output BAM file") + args = parser.parse_args() + + fix_bam(args.bam, args.unmapped, args.output) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/build_containers.sh b/build_containers.sh index cb27651..beb9e6c 100755 --- a/build_containers.sh +++ b/build_containers.sh @@ -115,7 +115,7 @@ if [ "$build_docker" = true ]; then # List of image names image_list=( ) - for dir in docker/*; do + for dir in containers/docker/*; do cd "${dir}" imgname=$(echo $dir | rev | cut -d/ -f1 | rev) image_list+=(${imgname}) @@ -161,7 +161,7 @@ if [ "$build_singularity" = true ]; then sif_list=( ) # Loop through all .def files in singularity/ subdirectories - for def_file in singularity/*/*.def; do + for def_file in containers/singularity/*/*.def; do # Extract image name from directory imgname=$(basename "$(dirname "$def_file")") sif_output="sif_images/${imgname}.sif" diff --git a/config/softwares.config b/config/softwares.config index 70c204e..f0dc0d1 100644 --- a/config/softwares.config +++ b/config/softwares.config @@ -46,7 +46,7 @@ process { container = 'quay.io/biocontainers/star:2.7.10b--h6b7c446_1' } withLabel: 'samtools' { - container = 'quay.io/biocontainers/samtools:1.3.1--h0cf4675_11' + container = 'quay.io/biocontainers/samtools:1.23--h96c455f_0' } withLabel: "sapin" { container = singularity.enabled ? "${params.sifPath}/sapin.sif" : "sapin" diff --git a/docker/jacusa2/Dockerfile b/containers/docker/jacusa2/Dockerfile similarity index 100% rename from docker/jacusa2/Dockerfile rename to containers/docker/jacusa2/Dockerfile diff --git a/docker/pluviometer/Dockerfile b/containers/docker/pluviometer/Dockerfile similarity index 100% rename from docker/pluviometer/Dockerfile rename to containers/docker/pluviometer/Dockerfile diff --git a/docker/pluviometer/env_pluviometer.yml b/containers/docker/pluviometer/env_pluviometer.yml similarity index 98% rename from docker/pluviometer/env_pluviometer.yml rename to containers/docker/pluviometer/env_pluviometer.yml index 64f02b3..47b0cb0 100644 --- a/docker/pluviometer/env_pluviometer.yml +++ b/containers/docker/pluviometer/env_pluviometer.yml @@ -51,6 +51,7 @@ dependencies: - typing_extensions=4.13.2=pyh29332c3_0 - tzdata=2025b=h78e105d_0 - wheel=0.45.1=pyhd8ed1ab_1 + - pysam=0.23.3=py312h47d5410_1 - pip: - flameprof==0.4 prefix: /home/earx/miniforge3/envs/rain-stats-scripts diff --git a/docker/reditools2/Dockerfile b/containers/docker/reditools2/Dockerfile similarity index 100% rename from docker/reditools2/Dockerfile rename to containers/docker/reditools2/Dockerfile diff --git a/docker/reditools2/env_reditools2.yml b/containers/docker/reditools2/env_reditools2.yml similarity index 100% rename from docker/reditools2/env_reditools2.yml rename to containers/docker/reditools2/env_reditools2.yml diff --git a/docker/reditools3/Dockerfile b/containers/docker/reditools3/Dockerfile similarity index 100% rename from docker/reditools3/Dockerfile rename to containers/docker/reditools3/Dockerfile diff --git a/docker/sapin/Dockerfile b/containers/docker/sapin/Dockerfile similarity index 100% rename from docker/sapin/Dockerfile rename to containers/docker/sapin/Dockerfile diff --git a/docker/sapin/constraints.txt b/containers/docker/sapin/constraints.txt similarity index 100% rename from docker/sapin/constraints.txt rename to containers/docker/sapin/constraints.txt diff --git a/singularity/jacusa2/jacusa2.def b/containers/singularity/jacusa2/jacusa2.def similarity index 100% rename from singularity/jacusa2/jacusa2.def rename to containers/singularity/jacusa2/jacusa2.def diff --git a/singularity/pluviometer/env_pluviometer.yml b/containers/singularity/pluviometer/env_pluviometer.yml similarity index 98% rename from singularity/pluviometer/env_pluviometer.yml rename to containers/singularity/pluviometer/env_pluviometer.yml index 64f02b3..47b0cb0 100644 --- a/singularity/pluviometer/env_pluviometer.yml +++ b/containers/singularity/pluviometer/env_pluviometer.yml @@ -51,6 +51,7 @@ dependencies: - typing_extensions=4.13.2=pyh29332c3_0 - tzdata=2025b=h78e105d_0 - wheel=0.45.1=pyhd8ed1ab_1 + - pysam=0.23.3=py312h47d5410_1 - pip: - flameprof==0.4 prefix: /home/earx/miniforge3/envs/rain-stats-scripts diff --git a/singularity/pluviometer/pluviometer.def b/containers/singularity/pluviometer/pluviometer.def similarity index 100% rename from singularity/pluviometer/pluviometer.def rename to containers/singularity/pluviometer/pluviometer.def diff --git a/singularity/reditools2/entrypoint.sh b/containers/singularity/reditools2/entrypoint.sh similarity index 100% rename from singularity/reditools2/entrypoint.sh rename to containers/singularity/reditools2/entrypoint.sh diff --git a/singularity/reditools2/env_reditools2.yml b/containers/singularity/reditools2/env_reditools2.yml similarity index 100% rename from singularity/reditools2/env_reditools2.yml rename to containers/singularity/reditools2/env_reditools2.yml diff --git a/singularity/reditools2/reditools2.def b/containers/singularity/reditools2/reditools2.def similarity index 100% rename from singularity/reditools2/reditools2.def rename to containers/singularity/reditools2/reditools2.def diff --git a/singularity/reditools3/reditools3.def b/containers/singularity/reditools3/reditools3.def similarity index 100% rename from singularity/reditools3/reditools3.def rename to containers/singularity/reditools3/reditools3.def diff --git a/singularity/sapin/constraints.txt b/containers/singularity/sapin/constraints.txt similarity index 100% rename from singularity/sapin/constraints.txt rename to containers/singularity/sapin/constraints.txt diff --git a/singularity/sapin/sapin.def b/containers/singularity/sapin/sapin.def similarity index 100% rename from singularity/sapin/sapin.def rename to containers/singularity/sapin/sapin.def diff --git a/data/chr21/chr21_small.bam b/data/chr21/chr21sample2_small.bam similarity index 100% rename from data/chr21/chr21_small.bam rename to data/chr21/chr21sample2_small.bam diff --git a/modules/aline.nf b/modules/aline.nf index 3038bc8..1c4aa63 100644 --- a/modules/aline.nf +++ b/modules/aline.nf @@ -30,7 +30,7 @@ process AliNe { pipeline_name, profile, config, - reads, + "--reads ${reads}", "--reference ${genome}", read_type, aligner, diff --git a/modules/bash.nf b/modules/bash.nf index e877bf6..59e64b3 100644 --- a/modules/bash.nf +++ b/modules/bash.nf @@ -18,4 +18,69 @@ process extract_libtype { LIBTYPE=\$(grep expected_format ${samlmon_json} | awk '{print \$2}' | tr -d '",\n') """ +} + +/* + * Transform A to G bases in FASTQ reads for hyper-editing detection + * Converts all adenosine (A) nucleotides to guanosine (G) in sequence lines + * Handles both single-end and paired-end reads + */ +process transform_bases_fastq { + label 'bash' + tag "${meta.id}" + publishDir("${output_dir}", mode:"copy", pattern: "*_AtoG.fastq.gz") + + input: + tuple val(meta), path(fastq) // Can be single file or multiple files (R1, R2, singles) + val output_dir + + output: + tuple val(meta), path("*_AtoG.fastq.gz"), emit: converted_fastq + + script: + """ + # Process all FASTQ files (handles both single-end and paired-end) + for fq in ${fastq}; do + base=\$(basename "\${fq}" .fastq.gz) + base=\$(basename "\${base}" .fq.gz) + + # Decompress, convert A to G in sequence lines (every 4th line starting from line 2), + # then recompress + zcat "\${fq}" | awk 'NR%4==2 {gsub(/A/,"G"); gsub(/a/,"g")} {print}' | gzip > "\${base}_AtoG.fastq.gz" + done + """ +} + +/* + * Transform A to G bases in reference genome FASTA for hyper-editing detection + * Converts all adenosine (A) nucleotides to guanosine (G) in sequence lines (non-header lines) + */ +process transform_bases_fasta { + label 'bash' + tag "${fasta.baseName}" + publishDir("${output_dir}", mode:"copy", pattern: "*_AtoG.fa") + + input: + path fasta + val output_dir + + output: + path "*_AtoG.fa", emit: converted_fasta + + script: + def is_compressed = fasta.name.endsWith('.gz') + def base_name = fasta.baseName.replaceAll(/\.fa(sta)?/, '') + def output_name = "${base_name}_AtoG.fa" + + if (is_compressed) { + """ + # Decompress, convert A to G in non-header lines, save uncompressed + zcat ${fasta} | awk '/^>/ {print; next} {gsub(/A/,"G"); gsub(/a/,"g"); print}' > ${output_name} + """ + } else { + """ + # Convert A to G in non-header lines (lines not starting with >) + awk '/^>/ {print; next} {gsub(/A/,"G"); gsub(/a/,"g"); print}' ${fasta} > ${output_name} + """ + } } \ No newline at end of file diff --git a/modules/pluviometer.nf b/modules/pluviometer.nf index f03f348..8b49691 100644 --- a/modules/pluviometer.nf +++ b/modules/pluviometer.nf @@ -1,6 +1,6 @@ process pluviometer { label "pluviometer" - publishDir("${params.outdir}/feature_edits", mode: "copy") + publishDir("${params.outdir}/pluviometer/${tool_format}", mode: "copy") tag "${meta.id}" input: @@ -9,7 +9,7 @@ process pluviometer { val(tool_format) output: - tuple(val(meta), path("features.tsv"), path("aggregates.tsv"), path("pluviometer.log"), emit: tuple_sample_feature_edits) + tuple(val(meta), path("*features.tsv"), path("*aggregates.tsv"), path("*pluviometer.log"), emit: tuple_sample_feature_edits) script: @@ -22,6 +22,7 @@ process pluviometer { --cov 1 \ --edit_threshold ${params.edit_threshold} \ --threads ${task.cpus} \ - --aggregation_mode ${params.aggregation_mode} + --aggregation_mode ${params.aggregation_mode} \ + --output ${meta.id} """ } \ No newline at end of file diff --git a/modules/python.nf b/modules/python.nf new file mode 100644 index 0000000..2a26b65 --- /dev/null +++ b/modules/python.nf @@ -0,0 +1,27 @@ +/* +Here are described all processes related to rust +*/ + +/* + * Restore original read sequences in BAM files from FASTQ + * Used after alignment with A-to-G converted sequences to restore original bases + */ +process restore_original_sequences { + label "pluviometer" + tag "${bam.baseName}" + publishDir("${output_dir}", mode:"copy", pattern: "*_restored.bam") + + input: + tuple( val(meta), path(bam), path(bam_unmapped)) + val output_dir + + output: + tuple val(meta), path("*_restored.bam"), emit: restored_bam + + script: + def output_name = "${bam.baseName}_restored.bam" + + """ + restore_sequences.py -b ${bam} -u ${bam_unmapped} -o ${output_name} + """ +} \ No newline at end of file diff --git a/modules/samtools.nf b/modules/samtools.nf index a3f9c34..8dbc4e1 100644 --- a/modules/samtools.nf +++ b/modules/samtools.nf @@ -57,4 +57,85 @@ process samtools_sort_bam { samtools sort -@ ${task.cpus} ${bam} -o ${bam.baseName}_sorted.bam fi """ +} + +/* + * Split BAM file into mapped and unmapped reads + */ +process samtools_split_mapped_unmapped { + label "samtools" + tag "${meta.id}" + publishDir("${params.outdir}/split_bam", mode:"copy", pattern: "*_{mapped,unmapped}.bam") + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*_mapped.bam"), emit: mapped_bam + tuple val(meta), path("*_unmapped.bam"), emit: unmapped_bam + + script: + def base = bam.baseName + // _seqkit suffix is added by aline when starting from fastq, when starting from BAM, there is no _seqkit suffix. + // We want to keep the same base name for both cases, because it is used to recognize the sample name (sample name is what is bofore the first occurrence of _seqkit) + // It is needed at the step of sequence restoration, where we join the aligned BAM with the original unmapped BAM based on the sample name. (see hyper-editing.nf) + def suffix = base.contains('_seqkit') ? '' : '_seqkit' + """ + # Extract mapped reads (SAM flag -F 4: exclude unmapped) + samtools view -@ ${task.cpus} -b -F 4 ${bam} > ${base}${suffix}_mapped.bam + + # Extract unmapped reads (SAM flag -f 4: include only unmapped) + samtools view -@ ${task.cpus} -b -f 4 ${bam} > ${base}${suffix}_unmapped.bam + """ +} + +/* + * Convert BAM to FASTQ format + * Extracts reads from BAM and converts to FASTQ for re-alignment + */ +process convert_to_fastq { + label "samtools" + tag "${meta.id}" + publishDir("${output_dir}", mode:"copy", pattern: "*.fastq*") + + input: + tuple val(meta), path(bam) + val output_dir + + output: + tuple val(meta), path("*.fastq.gz"), emit: fastq_reads + + script: + def output_base = "${bam.baseName}" + """ + # Check if BAM contains paired-end or single-end reads + if samtools view -c -f 1 ${bam} | grep -q "^0\$"; then + # Single-end reads + samtools fastq -@ ${task.cpus} ${bam} | gzip > ${output_base}.fastq.gz + else + # Paired-end reads + samtools fastq -@ ${task.cpus} -1 ${output_base}_R1.fastq.gz -2 ${output_base}_R2.fastq.gz ${bam} + fi + """ +} + +/* + * Merge two BAM files + * Combines aligned reads from two BAM files into a single output BAM + */ +process samtools_merge_bams { + label "samtools" + tag "${meta.id}" + publishDir("${params.outdir}/merged_bam", mode:"copy") + + input: + tuple val(meta), path(bam1), path(bam2) + + output: + tuple val(meta), path("*_merged.bam"), emit: merged_bam + + script: + """ + samtools merge -@ ${task.cpus} ${meta.id}_merged.bam ${bam1} ${bam2} + """ } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 7fd8985..df88ba9 100644 --- a/nextflow.config +++ b/nextflow.config @@ -60,7 +60,7 @@ profiles { } test { params.aligner = "STAR" - params.reads = "${baseDir}/data/chr21/chr21_small_R1.fastq.gz" + params.reads = "${baseDir}/data/chr21/chr21_small_R1.fastq.gz,${baseDir}/data/chr21/chr21_small_R2.fastq.gz" params.genome = "${baseDir}/data/chr21/chr21_small.fasta.gz" params.annotation = "${baseDir}/data/chr21/chr21_small_filtered.gff3.gz" params.strandedness = "ISR" diff --git a/rain.nf b/rain.nf index 08320ca..fd759c0 100644 --- a/rain.nf +++ b/rain.nf @@ -16,13 +16,15 @@ params.reads = null // "/path/to/reads_{1,2}.fastq.gz/or/folder" params.genome = null // "/path/to/genome.fa" params.annotation = null // "/path/to/annotations.gff3" params.outdir = "rain_result" -params.clipoverlap = false +params.clip_overlap = false +params.clean_duplicate = true // Edit counting params edit_site_tools = ["reditools2", "reditools3", "jacusa2", "sapin"] params.edit_site_tool = "reditools3" params.edit_threshold = 1 params.aggregation_mode = "all" +params.skip_hyper_editing = false // Skip hyper-editing detection // Report params params.multiqc_config = "$baseDir/config/multiqc_config.yaml" // MultiQC config file @@ -32,6 +34,7 @@ params.region = "" // e.g. chr21 - Used to limit the analysis to a specific regi // others params.help = null params.monochrome_logs = false // if true, no color in logs +params.debug = false // Enable debug output // -------------------------------------------------- /* ---- Params shared between RAIN and AliNe ---- */ @@ -100,6 +103,8 @@ def helpMSG() { Input sequences: --annotation Path to the annotation file (GFF or GTF) + --genome Path to the reference genome in FASTA format. + --read_type Type of reads among this list ${read_type_allowed} [no default] --reads path to the reads file, folder or csv. If a folder is provided, all the files with proper extension in the folder will be used. You can provide remote files (commma separated list). file extension expected : <.fastq.gz>, <.fq.gz>, <.fastq>, <.fq> or <.bam>. for paired reads extra <_R1_001> or <_R2_001> is expected where and <_001> are optional. e.g. , , ) @@ -109,20 +114,21 @@ def helpMSG() { sample,fastq_1,fastq_2,strandedness,read_type control1,path/to/data1.fastq.bam,,auto,short_single control2,path/to/data2_R1.fastq.gz,path/to/data2_R2.fastq.gz,auto,short_paired - --genome Path to the reference genome in FASTA format. - --read_type Type of reads among this list ${read_type_allowed} [no default] Output: --output Path to the output directory [default: $params.outdir] Optional input: + --aggregation_mode Mode for aggregating edition counts mapped on genomic features. See documentation for details. Options are: "all" (default) or "cds_longest" --aligner Aligner to use [default: $params.aligner] + --clean_duplicate Remove PCR duplicates from BAM files using GATK MarkDuplicates. [default: $params.clean_duplicate] + --clip_overlap Clip overlapping sequences in read pairs to avoid double counting. [default: $params.clipoverlap] + --debug Enable debug output for troubleshooting. [default: $params.debug] --edit_site_tool Tool used for detecting edited sites. [default: $params.edit_site_tool] - --strandedness Set the strandedness for all your input reads [default: $params.strandedness]. In auto mode salmon will guess the library type for each fastq sample. [ 'U', 'IU', 'MU', 'OU', 'ISF', 'ISR', 'MSF', 'MSR', 'OSF', 'OSR', 'auto' ] --edit_threshold Minimal number of edited reads to count a site as edited [default: $params.edit_threshold] - --aggregation_mode Mode for aggregating edition counts mapped on genomic features. See documentation for details. Options are: "all" (default) or "cds_longest" - --clipoverlap Clip overlapping sequences in read pairs to avoid double counting. [default: $params.clipoverlap] --fastqc run fastqc on main steps [default: $params.fastqc] + --skip_hyper_editing Skip hyper-editing detection step for unmapped reads. [default: $params.skip_hyper_editing] + --strandedness Set the strandedness for all your input reads [default: $params.strandedness]. In auto mode salmon will guess the library type for each fastq sample. [ 'U', 'IU', 'MU', 'OU', 'ISF', 'ISR', 'MSF', 'MSR', 'OSF', 'OSR', 'auto' ] Nextflow options: -profile Change the profile of nextflow both the engine and executor more details on github README [debug, test, itrop, singularity, local, docker] @@ -139,10 +145,13 @@ General Parameters genome : ${params.genome} strandedness : ${params.strandedness} read_type : ${params.read_type} + hyper-editing : ${params.skip_hyper_editing ? "skipped" : "performed"} + clip_overlap : ${params.clip_overlap} + clean_duplicate : ${params.clean_duplicate} outdir : ${params.outdir} Alignment Parameters - aline_profiles : ${params.aline_profiles} + aline_profiles : ${params.aline_profiles} aligner : ${params.aligner} @@ -169,13 +178,14 @@ include {fastqc as fastqc_ali; fastqc as fastqc_dup; fastqc as fastqc_clip} from include {gatk_markduplicates } from './modules/gatk.nf' include {multiqc} from './modules/multiqc.nf' include {fasta_unzip} from "$baseDir/modules/pigz.nf" -include {samtools_index; samtools_fasta_index; samtools_sort_bam} from './modules/samtools.nf' +include {samtools_index; samtools_fasta_index; samtools_sort_bam as samtools_sort_bam_raw; samtools_sort_bam as samtools_sort_bam_merged; samtools_split_mapped_unmapped; samtools_merge_bams} from './modules/samtools.nf' include {reditools2} from "./modules/reditools2.nf" include {reditools3} from "./modules/reditools3.nf" include {jacusa2} from "./modules/jacusa2.nf" include {sapin} from "./modules/sapin.nf" include {normalize_gxf} from "./modules/agat.nf" include {pluviometer as pluviometer_jacusa2; pluviometer as pluviometer_reditools2; pluviometer as pluviometer_reditools3; pluviometer as pluviometer_sapin} from "./modules/pluviometer.nf" +include {HYPER_EDITING} from "./subworkflows/hyper-editing.nf" //************************************************* // STEP 3 - Deal with parameters @@ -397,12 +407,9 @@ workflow { } } - // sort the bam files - Channel.empty().set{sorted_bam} - sorted_bam = samtools_sort_bam( bams ) // stat on aligned reads if(params.fastqc){ - fastqc_ali(sorted_bam, "ali") + fastqc_ali(bams, "ali") logs.concat(fastqc_ali.out).set{logs} // save log } @@ -475,7 +482,7 @@ workflow { 'Juke34/AliNe -r v1.6.0', // Select pipeline "${workflow.resume?'-resume':''} -profile ${aline_profile}", // workflow opts supplied as params for flexibility "-config ${params.aline_profiles}", - "--reads ${aline_data_in}", + aline_data_in, genome, "--read_type ${params.read_type}", "--aligner ${params.aligner}", @@ -571,49 +578,110 @@ workflow { } // MERGE ALINE BAM AND INPUT BAM TOGETHER - tuple_sample_sortedbam = aline_alignments_all.mix(sorted_bam) - log.info "The following bam file(s) will be processed by RAIN:" - tuple_sample_sortedbam.view() + all_input_bam = aline_alignments_all.mix(bams) + if (params.debug) { + all_input_bam.view { meta, bam -> log.info "RAIN processing ${bam} (sample: ${meta.id})" } + } +// ------------------------------------------------------- +// ----------------- DETECT HYPER_EDITING ---------------- +// ------------------------------------------------------- + + // Split BAM into mapped and unmapped reads + samtools_split_mapped_unmapped(all_input_bam) + + // Process hyper-editing if not skipped + Channel.empty().set{all_bam} + if (!params.skip_hyper_editing) { + + HYPER_EDITING( + samtools_split_mapped_unmapped.out.unmapped_bam, + genome, + aline_profile, + clean_annotation, + 30, // quality threshold + "${params.outdir}/hyper_editing", + ) + + // Access the results + hyperedit_bam_mapped = HYPER_EDITING.out.bam_mapped + bam_unmapped = HYPER_EDITING.out.bam_unmapped + + + // create a channel of tuples with (meta, bam, bam_he) joined by the id + samtools_split_mapped_unmapped.out.mapped_bam.map { meta, bam -> tuple(meta.id, meta, bam) } + .join( + hyperedit_bam_mapped.map { meta2, bam_he -> tuple(meta2.id, meta2, bam_he) } + ) + .map { id, meta, bam, meta2, bam_he -> tuple(meta, bam, bam_he) } + .set { meta_bam_bamhe } + + // Merge the final bam with the original bam in case of hyper-editing to keep all reads for edition site detection + merged_bams = samtools_merge_bams(meta_bam_bamhe) + + // Add hyper-editing sample to analysis + // Here we have bam containing normal and hyper_editing reads and bam containing only hyper-editing reads. + // The bam with only hyper-editing reads will be used to count hyper-editing sites specifically. + all_bam = merged_bams.mix(hyperedit_bam_mapped) + + } else { + all_bam = samtools_split_mapped_unmapped.out.mapped_bam + } + + // Sort the bam files + all_bam_sorted = samtools_sort_bam_merged(all_bam) // remove duplicates - gatk_markduplicates(tuple_sample_sortedbam) - logs.concat(gatk_markduplicates.out.log).set{logs} // save log - // stat on bam without duplicates - if(params.fastqc){ - fastqc_dup(gatk_markduplicates.out.tuple_sample_dedupbam, "dup") - logs.concat(fastqc_dup.out).set{logs} // save log + if(params.clean_duplicate){ + gatk_markduplicates(all_bam_sorted) + all_bam_sorted_dedup = gatk_markduplicates.out.tuple_sample_dedupbam + logs.concat(gatk_markduplicates.out.log).set{logs} // save log + // stat on bam without duplicates + if(params.fastqc){ + fastqc_dup(all_bam_sorted_dedup, "dup") + logs.concat(fastqc_dup.out).set{logs} // save log + } + } + else { + all_bam_sorted_dedup = all_bam_sorted } + // Clip overlap - if (params.clipoverlap) { + if (params.clip_overlap) { bamutil_clipoverlap(gatk_markduplicates.out.tuple_sample_dedupbam) - tuple_sample_bam_processed = bamutil_clipoverlap.out.tuple_sample_clipoverbam + all_bam_sorted_dedup_clip = bamutil_clipoverlap.out.tuple_sample_clipoverbam // stat on bam with overlap clipped if(params.fastqc){ - fastqc_clip(tuple_sample_bam_processed, "clip") + fastqc_clip(all_bam_sorted_dedup_clip, "clip") logs.concat(fastqc_clip.out).set{logs} // save log } } else { - tuple_sample_bam_processed = gatk_markduplicates.out.tuple_sample_dedupbam + all_bam_sorted_dedup_clip = gatk_markduplicates.out.tuple_sample_dedupbam } - // index bam - samtools_index(tuple_sample_bam_processed) + + // index mapped bam + samtools_index(all_bam_sorted_dedup_clip) + final_bam_for_editing = samtools_index.out.tuple_sample_bam_bamindex + +// ------------------------------------------------------- +// ----------------- DETECT EDITING SITES ---------------- +// ------------------------------------------------------- // Select site detection tool if ( "jacusa2" in edit_site_tool_list ){ // Create a fasta index file of the reference genome samtools_fasta_index(genome.collect()) - jacusa2(samtools_index.out.tuple_sample_bam_bamindex, samtools_fasta_index.out.tuple_fasta_fastaindex.collect()) + jacusa2(final_bam_for_editing, samtools_fasta_index.out.tuple_fasta_fastaindex.collect()) pluviometer_jacusa2(jacusa2.out.tuple_sample_jacusa2_table, clean_annotation.collect(), "jacusa2") } if ( "sapin" in edit_site_tool_list ){ sapin(tuple_sample_bam_processed, genome.collect()) } if ( "reditools2" in edit_site_tool_list ){ - reditools2(samtools_index.out.tuple_sample_bam_bamindex, genome.collect(), params.region) + reditools2(final_bam_for_editing, genome.collect(), params.region) pluviometer_reditools2(reditools2.out.tuple_sample_serial_table, clean_annotation.collect(), "reditools2") } if ( "reditools3" in edit_site_tool_list ){ - reditools3(samtools_index.out.tuple_sample_bam_bamindex, genome.collect()) + reditools3(final_bam_for_editing, genome.collect()) pluviometer_reditools3(reditools3.out.tuple_sample_serial_table, clean_annotation.collect(), "reditools3") } diff --git a/subworkflows/hyper-editing.nf b/subworkflows/hyper-editing.nf new file mode 100644 index 0000000..95fad1c --- /dev/null +++ b/subworkflows/hyper-editing.nf @@ -0,0 +1,117 @@ +/* + * RAIN Subworkflow: Hyper-editing detection + * + * Detects extensive RNA editing events through A-to-G conversion strategy + * This approach captures heavily edited reads that may escape standard detection + */ + +include { AliNe as ALIGNMENT } from "${baseDir}/modules/aline.nf" +include { convert_to_fastq; samtools_fasta_index; samtools_split_mapped_unmapped } from "${baseDir}/modules/samtools.nf" +include { transform_bases_fastq; transform_bases_fasta } from "${baseDir}/modules/bash.nf" +include { multiqc } from "${baseDir}/modules/multiqc.nf" +include { restore_original_sequences } from "${baseDir}/modules/python.nf" + +/** + * Hyper-editing discovery workflow + * + * Strategy: + * 1. Extract unmapped reads from primary alignment + * 2. Convert adenosines to guanosines in reads and reference + * 3. Re-align with converted sequences + * 4. Restore original bases for downstream analysis + * + */ +workflow HYPER_EDITING { + + take: + unmapped_bams // Unmapped read chunks from primary alignment + genome // Genomic reference sequence + aline_profile // AliNe profile in coma-separated format + clean_annotation // Annotation file for AliNe + quality_threshold // Quality score filter threshold + output_he // output directory path ier + + main: + // For now, create an empty channel until processes are implemented + // This prevents the workflow from failing while we build out the functionality + Channel.empty().set { filtered_bams } + + // Stage 1: Convert unmapped BAM to FASTQ format + extracted_reads = convert_to_fastq(unmapped_bams, output_he) + + // Stage 2: Apply A-to-G nucleotide transformation to reads + converted_reads = transform_bases_fastq(extracted_reads, output_he) + + // Stage 3: Generate A-to-G converted reference genome + converted_reference = transform_bases_fasta(genome, output_he) + + // Stage 4: Collect converted reads paths for AliNe + converted_reads + .map { meta, fastq -> fastq } + .collect() + .map { files -> + // Create a comma-separated list or single file path + files.collect { it.toString() }.join(',') + } + .set { reads_path } + + // Stage 5: Build alignment index for converted reference + alignment_index = samtools_fasta_index(converted_reference) + + // Stage 6: Perform alignment with converted sequences + if (params.debug) { + reads_path.view { paths -> log.info " AliNe feeded with FASTQ: ${paths.split(',').collect { it.split('/').last() }.join(', ')}" } + } + + ALIGNMENT ( + 'Juke34/AliNe -r v1.6.0', // Select pipeline + "${workflow.resume?'-resume':''} -profile ${aline_profile}", // workflow opts supplied as params for flexibility + "-config ${params.aline_profiles}", + reads_path, + converted_reference, + "--read_type ${params.read_type}", + "--aligner ${params.aligner}", + "--strandedness ${params.strandedness}", + clean_annotation, + workflow.workDir.resolve('Juke34/AliNe').toUriString() + ) + + // GET TUPLE [ID, BAM] FILES + ALIGNMENT.out.output + .map { dir -> + files("$dir/alignment/*/*.bam", checkIfExists: true) // Find BAM files inside the output directory + } + .flatten() // Ensure we emit each file separately + .map { bam -> + def basename = bam.getName() + def name = bam.getName().split('_seqkit')[0] // Extract the base name of the BAM file. _seqkit is the separator. + + tuple(name, bam) + } // Convert each BAM file into a tuple, with the base name as the first element + .set { aligned_bams } // Store the channel + if (params.debug) { + aligned_bams.view { id, bam -> log.info " AliNe HE produced BAM: ${bam.getName()} (sample: ${id})" } + } + // Stage 7: Merge aligned BAMs with original unmapped BAMs for sequence restoration + // create a channel of tuples with (meta, bam, fastq) joined by the id:chr21_small_R1 and sample name + aligned_bams.map { id, bam -> tuple(id, bam) } + .join( + unmapped_bams.map { meta2, bam_ori -> tuple(meta2.id, meta2, bam_ori) } + ) + .map { id, bam, meta2, bam_ori -> tuple(meta2, bam, bam_ori) } + .set { sample_bam_bamori } + + // Stage 8: Reconstruct original sequences from converted alignments + if (params.debug) { + sample_bam_bamori.view { meta, bam, bam_ori -> log.info " Restoring sequences for BAM: ${bam.getName()} (sample: ${meta.id}) using original BAM: ${bam_ori.getName()}" } + unmapped_bams.view { meta, bam -> log.info " Original unmapped BAM for restoration: ${bam.getName()} (sample: ${meta.id})" } + } + reconstructed_bams = restore_original_sequences(sample_bam_bamori, output_he) + + // Stage 9: Split BAM into mapped and unmapped reads + samtools_split_mapped_unmapped(reconstructed_bams) + + emit: + bam_unmapped = samtools_split_mapped_unmapped.out.unmapped_bam + bam_mapped = samtools_split_mapped_unmapped.out.mapped_bam +}