Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions bin/restore_sequences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python3
"""
Restore original sequences in BAM files from original unmapped BAM.
This tool replaces modified sequences with their original counterparts.
"""

import sys
import argparse
import pysam


def fix_bam(bam_aligned, bam_unmapped, bam_out):
    """Restore original read sequences in an aligned BAM.

    For each read in *bam_aligned*, look up the read of the same name in
    the original unmapped BAM and copy back its sequence and qualities,
    dropping the MD tag (which described the modified sequence).

    Args:
        bam_aligned: path to the aligned BAM whose sequences were modified.
        bam_unmapped: path to the original unmapped BAM (source of truth).
        bam_out: path of the output BAM to write.
    """
    unmapped = pysam.AlignmentFile(bam_unmapped, "rb")
    try:
        # Build an in-memory name index over the unmapped BAM for fast
        # read-name lookups.
        unmapped_index = pysam.IndexedReads(unmapped)
        unmapped_index.build()

        # pysam.Samfile is the deprecated alias of AlignmentFile.
        with pysam.AlignmentFile(bam_aligned, "rb") as bamfile, \
                pysam.AlignmentFile(bam_out, "wb", template=bamfile) as out:
            for read in bamfile:
                try:
                    matches = unmapped_index.find(read.query_name)
                except KeyError:
                    # Not in the unmapped BAM: keep the read as-is instead
                    # of silently dropping it from the output.
                    out.write(read)
                    continue
                for orig_read in matches:
                    # Paired-end mates share a query name: only copy from
                    # the matching mate, not whichever comes first.
                    if read.is_paired and orig_read.is_read1 != read.is_read1:
                        continue
                    # Assigning query_sequence resets the qualities, so
                    # qualities must be set afterwards.
                    read.query_sequence = orig_read.query_sequence
                    read.query_qualities = orig_read.query_qualities
                    # The MD tag refers to the old sequence; remove it.
                    read.set_tag("MD", None)
                    break
                # NOTE(review): if the aligner reverse-complemented the read
                # (read.is_reverse), the forward-strand original sequence is
                # copied as-is — confirm this is intended downstream.
                out.write(read)
    finally:
        unmapped.close()


def main():
    """Parse command-line arguments and run the sequence restoration."""
    parser = argparse.ArgumentParser(description=__doc__)
    # All three paths are mandatory; argparse enforces their presence.
    options = (
        ("-b", "--bam", "aligned BAM file"),
        ("-u", "--unmapped", "original unmapped BAM file"),
        ("-o", "--output", "output BAM file"),
    )
    for short_flag, long_flag, help_text in options:
        parser.add_argument(short_flag, long_flag, required=True, help=help_text)
    opts = parser.parse_args()

    fix_bam(opts.bam, opts.unmapped, opts.output)


if __name__ == "__main__":
    sys.exit(main())
4 changes: 2 additions & 2 deletions build_containers.sh
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ if [ "$build_docker" = true ]; then
# List of image names
image_list=( )

for dir in docker/*; do
for dir in containers/docker/*; do
cd "${dir}"
imgname=$(echo $dir | rev | cut -d/ -f1 | rev)
image_list+=(${imgname})
Expand Down Expand Up @@ -161,7 +161,7 @@ if [ "$build_singularity" = true ]; then
sif_list=( )

# Loop through all .def files in singularity/ subdirectories
for def_file in singularity/*/*.def; do
for def_file in containers/singularity/*/*.def; do
# Extract image name from directory
imgname=$(basename "$(dirname "$def_file")")
sif_output="sif_images/${imgname}.sif"
Expand Down
2 changes: 1 addition & 1 deletion config/softwares.config
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ process {
container = 'quay.io/biocontainers/star:2.7.10b--h6b7c446_1'
}
withLabel: 'samtools' {
container = 'quay.io/biocontainers/samtools:1.3.1--h0cf4675_11'
container = 'quay.io/biocontainers/samtools:1.23--h96c455f_0'
}
withLabel: "sapin" {
container = singularity.enabled ? "${params.sifPath}/sapin.sif" : "sapin"
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ dependencies:
- typing_extensions=4.13.2=pyh29332c3_0
- tzdata=2025b=h78e105d_0
- wheel=0.45.1=pyhd8ed1ab_1
- pysam=0.23.3=py312h47d5410_1
- pip:
- flameprof==0.4
prefix: /home/earx/miniforge3/envs/rain-stats-scripts
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ dependencies:
- typing_extensions=4.13.2=pyh29332c3_0
- tzdata=2025b=h78e105d_0
- wheel=0.45.1=pyhd8ed1ab_1
- pysam=0.23.3=py312h47d5410_1
- pip:
- flameprof==0.4
prefix: /home/earx/miniforge3/envs/rain-stats-scripts
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion modules/aline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ process AliNe {
pipeline_name,
profile,
config,
reads,
"--reads ${reads}",
"--reference ${genome}",
read_type,
aligner,
Expand Down
65 changes: 65 additions & 0 deletions modules/bash.nf
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,69 @@ process extract_libtype {
LIBTYPE=\$(grep expected_format ${samlmon_json} | awk '{print \$2}' | tr -d '",\n')
"""

}

/*
* Transform A to G bases in FASTQ reads for hyper-editing detection
* Converts all adenosine (A) nucleotides to guanosine (G) in sequence lines
* Handles both single-end and paired-end reads
*/
process transform_bases_fastq {
    label 'bash'
    tag "${meta.id}"
    // Only the converted FASTQs are published to the caller-supplied directory.
    publishDir("${output_dir}", mode:"copy", pattern: "*_AtoG.fastq.gz")

    input:
    tuple val(meta), path(fastq) // Can be single file or multiple files (R1, R2, singles)
    val output_dir

    output:
    tuple val(meta), path("*_AtoG.fastq.gz"), emit: converted_fastq

    script:
    """
    # Process all FASTQ files (handles both single-end and paired-end)
    for fq in ${fastq}; do
        # Strip either a .fastq.gz or a .fq.gz extension to get the base name.
        base=\$(basename "\${fq}" .fastq.gz)
        base=\$(basename "\${base}" .fq.gz)

        # Decompress, convert A to G in sequence lines (every 4th line starting from line 2),
        # then recompress
        # NOTE(review): assumes gzipped, strictly 4-line FASTQ records — confirm
        # upstream guarantees this (wrapped FASTQ would corrupt the conversion).
        zcat "\${fq}" | awk 'NR%4==2 {gsub(/A/,"G"); gsub(/a/,"g")} {print}' | gzip > "\${base}_AtoG.fastq.gz"
    done
    """
}

/*
* Transform A to G bases in reference genome FASTA for hyper-editing detection
* Converts all adenosine (A) nucleotides to guanosine (G) in sequence lines (non-header lines)
*/
process transform_bases_fasta {
    label 'bash'
    tag "${fasta.baseName}"
    // Only the converted FASTA is published to the caller-supplied directory.
    publishDir("${output_dir}", mode:"copy", pattern: "*_AtoG.fa")

    input:
    path fasta
    val output_dir

    output:
    path "*_AtoG.fa", emit: converted_fasta

    script:
    def is_compressed = fasta.name.endsWith('.gz')
    // Anchor the extension match: the unanchored /\.fa(sta)?/ would also strip
    // a ".fa"/".fasta" occurring mid-name (e.g. "x.fasta.v2" -> "x.v2").
    def base_name = fasta.baseName.replaceAll(/\.fa(sta)?$/, '')
    def output_name = "${base_name}_AtoG.fa"

    if (is_compressed) {
        """
        # Decompress, convert A to G in non-header lines, save uncompressed
        zcat ${fasta} | awk '/^>/ {print; next} {gsub(/A/,"G"); gsub(/a/,"g"); print}' > ${output_name}
        """
    } else {
        """
        # Convert A to G in non-header lines (lines not starting with >)
        awk '/^>/ {print; next} {gsub(/A/,"G"); gsub(/a/,"g"); print}' ${fasta} > ${output_name}
        """
    }
}
7 changes: 4 additions & 3 deletions modules/pluviometer.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
process pluviometer {
label "pluviometer"
publishDir("${params.outdir}/feature_edits", mode: "copy")
publishDir("${params.outdir}/pluviometer/${tool_format}", mode: "copy")
tag "${meta.id}"

input:
Expand All @@ -9,7 +9,7 @@ process pluviometer {
val(tool_format)

output:
tuple(val(meta), path("features.tsv"), path("aggregates.tsv"), path("pluviometer.log"), emit: tuple_sample_feature_edits)
tuple(val(meta), path("*features.tsv"), path("*aggregates.tsv"), path("*pluviometer.log"), emit: tuple_sample_feature_edits)


script:
Expand All @@ -22,6 +22,7 @@ process pluviometer {
--cov 1 \
--edit_threshold ${params.edit_threshold} \
--threads ${task.cpus} \
--aggregation_mode ${params.aggregation_mode}
--aggregation_mode ${params.aggregation_mode} \
--output ${meta.id}
"""
}
27 changes: 27 additions & 0 deletions modules/python.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
Here are described all processes related to python
*/

/*
 * Restore original read sequences in aligned BAM files from the original
 * unmapped BAM (not FASTQ — the lookup is done by read name via pysam).
 * Used after alignment with A-to-G converted sequences to restore original bases.
 */
process restore_original_sequences {
    label "pluviometer"
    // Tag by sample id for consistency with the other processes in this pipeline.
    tag "${meta.id}"
    publishDir("${output_dir}", mode:"copy", pattern: "*_restored.bam")

    input:
    tuple( val(meta), path(bam), path(bam_unmapped))
    val output_dir

    output:
    tuple val(meta), path("*_restored.bam"), emit: restored_bam

    script:
    def output_name = "${bam.baseName}_restored.bam"

    """
    restore_sequences.py -b ${bam} -u ${bam_unmapped} -o ${output_name}
    """
}
81 changes: 81 additions & 0 deletions modules/samtools.nf
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,85 @@ process samtools_sort_bam {
samtools sort -@ ${task.cpus} ${bam} -o ${bam.baseName}_sorted.bam
fi
"""
}

/*
* Split BAM file into mapped and unmapped reads
*/
/*
 * Split BAM file into mapped and unmapped reads (SAM flag 0x4).
 */
process samtools_split_mapped_unmapped {
    label "samtools"
    tag "${meta.id}"
    publishDir("${params.outdir}/split_bam", mode:"copy", pattern: "*_{mapped,unmapped}.bam")

    input:
    tuple val(meta), path(bam)

    output:
    tuple val(meta), path("*_mapped.bam"), emit: mapped_bam
    tuple val(meta), path("*_unmapped.bam"), emit: unmapped_bam

    script:
    def base = bam.baseName
    // _seqkit suffix is added by aline when starting from fastq, when starting from BAM, there is no _seqkit suffix.
    // We want to keep the same base name for both cases, because it is used to recognize the sample name (sample name is what is before the first occurrence of _seqkit)
    // It is needed at the step of sequence restoration, where we join the aligned BAM with the original unmapped BAM based on the sample name. (see hyper-editing.nf)
    def suffix = base.contains('_seqkit') ? '' : '_seqkit'
    """
    # Extract mapped reads (SAM flag -F 4: exclude unmapped)
    samtools view -@ ${task.cpus} -b -F 4 ${bam} > ${base}${suffix}_mapped.bam

    # Extract unmapped reads (SAM flag -f 4: include only unmapped)
    samtools view -@ ${task.cpus} -b -f 4 ${bam} > ${base}${suffix}_unmapped.bam
    """
}

/*
* Convert BAM to FASTQ format
* Extracts reads from BAM and converts to FASTQ for re-alignment
*/
process convert_to_fastq {
    label "samtools"
    tag "${meta.id}"
    publishDir("${output_dir}", mode:"copy", pattern: "*.fastq*")

    input:
    tuple val(meta), path(bam)
    val output_dir

    output:
    tuple val(meta), path("*.fastq.gz"), emit: fastq_reads

    script:
    def output_base = "${bam.baseName}"
    """
    # Check if BAM contains paired-end or single-end reads
    # (count of reads with the PAIRED flag 0x1; zero means single-end)
    if samtools view -c -f 1 ${bam} | grep -q "^0\$"; then
        # Single-end reads
        samtools fastq -@ ${task.cpus} ${bam} | gzip > ${output_base}.fastq.gz
    else
        # Paired-end reads (.gz extension makes samtools fastq compress the output)
        # NOTE(review): samtools fastq expects name-collated input for correct
        # mate pairing — confirm upstream BAMs are name-sorted/collated, or a
        # `samtools collate` step is needed before this call.
        samtools fastq -@ ${task.cpus} -1 ${output_base}_R1.fastq.gz -2 ${output_base}_R2.fastq.gz ${bam}
    fi
    """
}

/*
* Merge two BAM files
* Combines aligned reads from two BAM files into a single output BAM
*/
/*
 * Merge two BAM files into a single BAM named after the sample id.
 */
process samtools_merge_bams {
    label "samtools"
    tag "${meta.id}"
    publishDir("${params.outdir}/merged_bam", mode:"copy")

    input:
    tuple val(meta), path(bam1), path(bam2)

    output:
    tuple val(meta), path("*_merged.bam"), emit: merged_bam

    script:
    // Name the merged output after the sample id so downstream globs match.
    def merged_name = "${meta.id}_merged.bam"
    """
    samtools merge -@ ${task.cpus} ${merged_name} ${bam1} ${bam2}
    """
}
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ profiles {
}
test {
params.aligner = "STAR"
params.reads = "${baseDir}/data/chr21/chr21_small_R1.fastq.gz"
params.reads = "${baseDir}/data/chr21/chr21_small_R1.fastq.gz,${baseDir}/data/chr21/chr21_small_R2.fastq.gz"
params.genome = "${baseDir}/data/chr21/chr21_small.fasta.gz"
params.annotation = "${baseDir}/data/chr21/chr21_small_filtered.gff3.gz"
params.strandedness = "ISR"
Expand Down
Loading