From 71fb269c29a910d58a6065697b67938dfb7bdbcd Mon Sep 17 00:00:00 2001 From: earx Date: Tue, 24 Jun 2025 10:59:20 +0200 Subject: [PATCH 1/5] fix strandedness input --- modules/jacusa2.nf | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/jacusa2.nf b/modules/jacusa2.nf index 60f6a87..ec658c9 100644 --- a/modules/jacusa2.nf +++ b/modules/jacusa2.nf @@ -25,9 +25,7 @@ process jacusa2 { // Unstranded jacusa_strand_param = "UNSTRANDED" } else { - // Unsupported: Pass the library type string so that it's reported in - // the Jacusa2 error message - jacusa_strand_param = meta.libtype + jacusa_strand_param = "UNSTRANDED" } """ java -Xmx${task.memory.toMega()}M -jar /usr/local/bin/JACUSA_v2.0.4.jar \ From 8bfbf8a280dd8a40f80ddc93d4453e030f9ddb73 Mon Sep 17 00:00:00 2001 From: earx Date: Tue, 24 Jun 2025 11:17:45 +0200 Subject: [PATCH 2/5] set output format to extended BED6 and add comments explaining other options --- modules/jacusa2.nf | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/jacusa2.nf b/modules/jacusa2.nf index ec658c9..58c1e05 100644 --- a/modules/jacusa2.nf +++ b/modules/jacusa2.nf @@ -28,10 +28,16 @@ process jacusa2 { jacusa_strand_param = "UNSTRANDED" } """ + # -A Show all sites, inc. sites without variants + # -f Use BED-6 extended output format + # -p Number of threads + # -r Path of output file + # -c Minimum coverage: 1 read + # -s Store feature-filtered results in another file java -Xmx${task.memory.toMega()}M -jar /usr/local/bin/JACUSA_v2.0.4.jar \ call-1 \ -A \ - -f V \ + -f B \ -p ${task.cpus} \ -r ${base_name}.site_edits_jacusa2.txt \ -c 1 \ From e72470906c4ce8e1bb258c63a1f07f5c98bde528 Mon Sep 17 00:00:00 2001 From: earx Date: Tue, 24 Jun 2025 11:24:07 +0200 Subject: [PATCH 3/5] Add usage note to Dockerfile --- docker/jacusa2/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker/jacusa2/Dockerfile b/docker/jacusa2/Dockerfile index 208e659..5d431f9 100644 --- a/docker/jacusa2/Dockerfile +++ b/docker/jacusa2/Dockerfile @@ -7,3 +7,6 @@ RUN wget https://github.com/dieterich-lab/JACUSA2/releases/download/v2.0.4/JACUS RUN echo "#!/usr/bin/env bash" > /usr/local/bin/jacusa2 RUN echo "java -jar /usr/local/bin/JACUSA_v2.0.4.jar $@" >> /usr/local/bin/jacusa2 RUN chmod +x /usr/local/bin/jacusa2 + +# ⚠️ IMPORTANT +# Run the Docker image with the option `--ulimit nofile=1024:1024` From d40714efa5e1c8c2c658c2df704a37e43d28496e Mon Sep 17 00:00:00 2001 From: earx Date: Tue, 24 Jun 2025 12:14:02 +0200 Subject: [PATCH 4/5] add Jacusa2 reader. new field `score` in SiteVariantData --- bin/stats/site_variant_readers.py | 62 +++++++++++++++++++++++++++++-- bin/stats/utils.py | 1 + 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/bin/stats/site_variant_readers.py b/bin/stats/site_variant_readers.py index 896c2a6..47e95cd 100644 --- a/bin/stats/site_variant_readers.py +++ b/bin/stats/site_variant_readers.py @@ -7,6 +7,22 @@ REDITOOLS_FIELDS = ["Seqid", "Position", "Reference", "Strand", "Coverage", "MeanQ", "Frequencies"] REDITOOLS_FIELD_INDEX = {field: i for i, field in enumerate(REDITOOLS_FIELDS)} +# Jacusa output file fields in the extended BED6 format +JACUSA_FIELDS = ["contig", "start", "end", "name", "score", "strand", "bases11", "info", "filter", "ref"] +JACUSA_FIELDS_INDEX = {field: i for i, field in enumerate(JACUSA_FIELDS)} + +def skip_comments(handle: TextIO, s: str) -> Optional[str]: + """ + Read and return the next line in a text file handle that does not start with the comment prefix `s`. + """ + line: str = handle.readline() + + while line.startswith(s): + line = handle.readline() + + return line + + class RNAVariantReader(ABC): """Abstract class defining the API for readers""" @@ -48,7 +64,8 @@ def read(self) -> Optional[SiteVariantData]: strand=self.strand, coverage=1, mean_quality=30.0, - frequencies=self.frequencies + frequencies=self.frequencies, + score=0.0 ) self.position += 1 @@ -128,7 +145,8 @@ def _parse_parts(self) -> SiteVariantData: strand=strand, coverage=int(self.parts[REDITOOLS_FIELD_INDEX["Coverage"]]), mean_quality=float(self.parts[REDITOOLS_FIELD_INDEX["MeanQ"]]), - frequencies=np.int32(self.parts[REDITOOLS_FIELD_INDEX["Frequencies"]][1:-1].split(",") + [0]) + frequencies=np.int32(self.parts[REDITOOLS_FIELD_INDEX["Frequencies"]][1:-1].split(",") + [0]), + score=0.0 ) def read(self) -> Optional[SiteVariantData]: @@ -147,7 +165,6 @@ def close(self) -> None: """Close the file""" self.file_handle.close() - class Reditools2Reader(ReditoolsXReader): def parse_strand(self) -> int: strand = int(self.parts[REDITOOLS_FIELD_INDEX["Strand"]]) @@ -174,3 +191,42 @@ def parse_strand(self) -> int: case _: raise Exception(f"Invalid strand value: {strand_str}") +class Jacusa2Reader(): + def __init__(self, file_handle: TextIO) -> None: + self.file_handle: TextIO = file_handle + + line = skip_comments(self.file_handle, "##") + + # Check the Jacusa header + assert line.lstrip('#').split('\t') == JACUSA_FIELDS + + return None + + def read(self): + line: str = self.file_handle.readline() + parts: list[str] = line.split('\t') + + reference_nuc_str: str = parts[JACUSA_FIELDS_INDEX["ref"]] + + strand_str: str = parts[JACUSA_FIELDS_INDEX["strand"]] + + match strand_str: + case '.': + strand = 0 + case '+': + strand = 1 + case '-': + strand -1 + + frequencies=np.int32(parts[JACUSA_FIELDS_INDEX["bases11"]].split(',') + [0]) + + return SiteVariantData( + seqid=parts[JACUSA_FIELDS_INDEX["contig"]], + position=int(parts[JACUSA_FIELDS_INDEX["start"]]), + reference=NUC_STR_TO_IND[reference_nuc_str], + strand=strand, + coverage=sum(frequencies), + mean_quality=float("nan"), + frequencies=frequencies, + score=float(parts[JACUSA_FIELDS_INDEX["score"]]) + ) diff --git a/bin/stats/utils.py b/bin/stats/utils.py index 33d7012..5b6cc75 100644 --- a/bin/stats/utils.py +++ b/bin/stats/utils.py @@ -49,3 +49,4 @@ class SiteVariantData: coverage: int mean_quality: float frequencies: NDArray[np.int32] + score: float From 885e4a92b4ef3852ad7a136098d4b45cf4fe9272 Mon Sep 17 00:00:00 2001 From: earx Date: Tue, 24 Jun 2025 13:35:24 +0200 Subject: [PATCH 5/5] make pluviometer take jacusa2 input --- bin/stats/pluviometer.py | 5 ++++- bin/stats/site_variant_readers.py | 12 ++++++++---- modules/jacusa2.nf | 6 +++--- rain.nf | 2 ++ 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/bin/stats/pluviometer.py b/bin/stats/pluviometer.py index 3f725e1..976472a 100644 --- a/bin/stats/pluviometer.py +++ b/bin/stats/pluviometer.py @@ -13,7 +13,7 @@ import numpy as np from numpy.typing import NDArray from typing import TextIO, NamedTuple, Optional, Generator -from site_variant_readers import RNAVariantReader, Reditools2Reader, Reditools3Reader +from site_variant_readers import RNAVariantReader, Reditools2Reader, Reditools3Reader, Jacusa2Reader import argparse from contextlib import nullcontext import sys @@ -520,6 +520,9 @@ def scan_and_count(self, reader: RNAVariantReader) -> None: case "reditools3": print("Reditools3 format\n") sv_reader: RNAVariantReader = Reditools3Reader(sv_handle) + case "jacusa2": + print("Jacusa2 BED6 extended format") + sv_reader: RNAVariantReader = Jacusa2Reader(sv_handle) case _: raise Exception(f'Unimplemented format "{args.format}"') diff --git a/bin/stats/site_variant_readers.py b/bin/stats/site_variant_readers.py index 47e95cd..f352a59 100644 --- a/bin/stats/site_variant_readers.py +++ b/bin/stats/site_variant_readers.py @@ -198,12 +198,16 @@ def __init__(self, file_handle: TextIO) -> None: line = skip_comments(self.file_handle, "##") # Check the Jacusa header - assert line.lstrip('#').split('\t') == JACUSA_FIELDS + assert line.strip().lstrip('#').split('\t') == JACUSA_FIELDS return None - def read(self): - line: str = self.file_handle.readline() + def read(self) -> Optional[SiteVariantData]: + line: str = self.file_handle.readline().strip() + + if line == "": + return None + parts: list[str] = line.split('\t') reference_nuc_str: str = parts[JACUSA_FIELDS_INDEX["ref"]] @@ -222,7 +226,7 @@ def read(self): return SiteVariantData( seqid=parts[JACUSA_FIELDS_INDEX["contig"]], - position=int(parts[JACUSA_FIELDS_INDEX["start"]]), + position=int(parts[JACUSA_FIELDS_INDEX["start"]]), # Jacusa2 position is 0-based reference=NUC_STR_TO_IND[reference_nuc_str], strand=strand, coverage=sum(frequencies), diff --git a/modules/jacusa2.nf b/modules/jacusa2.nf index 58c1e05..20332a6 100644 --- a/modules/jacusa2.nf +++ b/modules/jacusa2.nf @@ -15,13 +15,13 @@ process jacusa2 { base_name = bam.BaseName // Set the strand orientation parameter from the library type parameter // Terms explained in https://salmon.readthedocs.io/en/latest/library_type.html - if (meta.libtype in ["ISR", "SR"]) { + if (meta.strandedness in ["ISR", "SR"]) { // First-strand oriented jacusa_strand_param = "FR_SECONDSTRAND" - } else if (meta.libtype in ["ISF", "SF"]) { + } else if (meta.strandedness in ["ISF", "SF"]) { // Second-strand oriented jacusa_strand_param = "RF_FIRSTSTRAND" - } else if (meta.libtype in ["IU", "U"]) { + } else if (meta.strandedness in ["IU", "U"]) { // Unstranded jacusa_strand_param = "UNSTRANDED" } else { diff --git a/rain.nf b/rain.nf index 278b928..b55788f 100644 --- a/rain.nf +++ b/rain.nf @@ -591,6 +591,8 @@ workflow { // Create a fasta index file of the reference genome samtools_fasta_index(genome.collect()) jacusa2(samtools_index.out.tuple_sample_bam_bamindex, samtools_fasta_index.out.tuple_fasta_fastaindex.collect()) + normalize_gxf(annotation.collect()) + pluviometer(jacusa2.out.tuple_sample_jacusa2_table, normalize_gxf.out.gff.collect(), "jacusa2") break case "sapin": sapin(tuple_sample_bam_processed, genome.collect())