Merge pull request #41 from Juke34/jacusa2

eascarrunz · web-flow · commit 0024d5316ba5 · 2025-06-24T13:57:20.000+02:00
Jacusa2
diff --git a/bin/stats/pluviometer.py b/bin/stats/pluviometer.py
@@ -13,7 +13,7 @@
 import numpy as np
 from numpy.typing import NDArray
 from typing import TextIO, NamedTuple, Optional, Generator
-from site_variant_readers import RNAVariantReader, Reditools2Reader, Reditools3Reader
+from site_variant_readers import RNAVariantReader, Reditools2Reader, Reditools3Reader, Jacusa2Reader
 import argparse
 from contextlib import nullcontext
 import sys
@@ -520,6 +520,9 @@ def scan_and_count(self, reader: RNAVariantReader) -> None:
             case "reditools3":
                 print("Reditools3 format\n")
                 sv_reader: RNAVariantReader = Reditools3Reader(sv_handle)
+            case "jacusa2":
+                print("Jacusa2 BED6 extended format")
+                sv_reader: RNAVariantReader = Jacusa2Reader(sv_handle)
             case _:
                 raise Exception(f'Unimplemented format "{args.format}"')
 
diff --git a/bin/stats/site_variant_readers.py b/bin/stats/site_variant_readers.py
@@ -7,6 +7,22 @@
 REDITOOLS_FIELDS = ["Seqid", "Position", "Reference", "Strand", "Coverage", "MeanQ", "Frequencies"]
 REDITOOLS_FIELD_INDEX = {field: i for i, field in  enumerate(REDITOOLS_FIELDS)}
 
+# Jacusa output file fields in the extended BED6 format
+JACUSA_FIELDS = ["contig", "start", "end", "name", "score", "strand", "bases11", "info", "filter", "ref"]
+JACUSA_FIELDS_INDEX = {field: i for i, field in enumerate(JACUSA_FIELDS)}
+
+def skip_comments(handle: TextIO, s: str) -> Optional[str]:
+    """
+    Read and return the next line in a text file handle that does not start with the comment prefix `s`.
+    """
+    line: str = handle.readline()
+    
+    while line.startswith(s):
+        line = handle.readline()
+
+    return line
+
+
 class RNAVariantReader(ABC):
     """Abstract class defining the API for readers"""
 
@@ -48,7 +64,8 @@ def read(self) -> Optional[SiteVariantData]:
             strand=self.strand,
             coverage=1,
             mean_quality=30.0,
-            frequencies=self.frequencies
+            frequencies=self.frequencies,
+            score=0.0
         )
         self.position += 1
 
@@ -128,7 +145,8 @@ def _parse_parts(self) -> SiteVariantData:
             strand=strand,
             coverage=int(self.parts[REDITOOLS_FIELD_INDEX["Coverage"]]),
             mean_quality=float(self.parts[REDITOOLS_FIELD_INDEX["MeanQ"]]),
-            frequencies=np.int32(self.parts[REDITOOLS_FIELD_INDEX["Frequencies"]][1:-1].split(",") + [0])
+            frequencies=np.int32(self.parts[REDITOOLS_FIELD_INDEX["Frequencies"]][1:-1].split(",") + [0]),
+            score=0.0
         )
 
     def read(self) -> Optional[SiteVariantData]:
@@ -147,7 +165,6 @@ def close(self) -> None:
         """Close the file"""
         self.file_handle.close()
     
-
 class Reditools2Reader(ReditoolsXReader):
     def parse_strand(self) -> int:
         strand = int(self.parts[REDITOOLS_FIELD_INDEX["Strand"]])
@@ -174,3 +191,46 @@ def parse_strand(self) -> int:
             case _:
                 raise Exception(f"Invalid strand value: {strand_str}")
 
+class Jacusa2Reader():
+    def __init__(self, file_handle: TextIO) -> None:
+        self.file_handle: TextIO = file_handle
+
+        line = skip_comments(self.file_handle, "##")
+
+        # Check the Jacusa header
+        assert line.strip().lstrip('#').split('\t') == JACUSA_FIELDS
+        
+        return None
+    
+    def read(self) -> Optional[SiteVariantData]:
+        line: str = self.file_handle.readline().strip()
+
+        if line == "":
+            return None
+
+        parts: list[str] = line.split('\t')
+
+        reference_nuc_str: str = parts[JACUSA_FIELDS_INDEX["ref"]]
+
+        strand_str: str = parts[JACUSA_FIELDS_INDEX["strand"]]
+
+        match strand_str:
+            case '.':
+                strand = 0
+            case '+':
+                strand = 1
+            case '-':
+                strand -1
+
+        frequencies=np.int32(parts[JACUSA_FIELDS_INDEX["bases11"]].split(',') + [0])
+        
+        return SiteVariantData(
+            seqid=parts[JACUSA_FIELDS_INDEX["contig"]],
+            position=int(parts[JACUSA_FIELDS_INDEX["start"]]),  # Jacusa2 position is 0-based
+            reference=NUC_STR_TO_IND[reference_nuc_str],
+            strand=strand,
+            coverage=sum(frequencies),
+            mean_quality=float("nan"),
+            frequencies=frequencies,
+            score=float(parts[JACUSA_FIELDS_INDEX["score"]])
+        )
diff --git a/bin/stats/utils.py b/bin/stats/utils.py
@@ -49,3 +49,4 @@ class SiteVariantData:
     coverage: int
     mean_quality: float
     frequencies: NDArray[np.int32]
+    score: float
diff --git a/docker/jacusa2/Dockerfile b/docker/jacusa2/Dockerfile
@@ -7,3 +7,6 @@ RUN wget https://github.com/dieterich-lab/JACUSA2/releases/download/v2.0.4/JACUS
 RUN echo "#!/usr/bin/env bash" > /usr/local/bin/jacusa2
 RUN echo "java -jar /usr/local/bin/JACUSA_v2.0.4.jar $@" >> /usr/local/bin/jacusa2
 RUN chmod +x /usr/local/bin/jacusa2
+
+# ⚠️ IMPORTANT
+# Run the Docker image with the option `--ulimit nofile=1024:1024`
diff --git a/modules/jacusa2.nf b/modules/jacusa2.nf
@@ -15,25 +15,29 @@ process jacusa2 {
         base_name = bam.BaseName
         // Set the strand orientation parameter from the library type parameter
         // Terms explained in https://salmon.readthedocs.io/en/latest/library_type.html
-        if (meta.libtype in ["ISR", "SR"]) {
+        if (meta.strandedness in ["ISR", "SR"]) {
             // First-strand oriented
             jacusa_strand_param = "FR_SECONDSTRAND"
-        } else if (meta.libtype in ["ISF", "SF"]) {
+        } else if (meta.strandedness in ["ISF", "SF"]) {
             // Second-strand oriented
             jacusa_strand_param = "RF_FIRSTSTRAND"
-        } else if (meta.libtype in ["IU", "U"]) {
+        } else if (meta.strandedness in ["IU", "U"]) {
             // Unstranded
             jacusa_strand_param = "UNSTRANDED"
         } else {
-            // Unsupported: Pass the library type string so that it's reported in
-            // the Jacusa2 error message
-            jacusa_strand_param = meta.libtype
+            jacusa_strand_param = "UNSTRANDED"
         }
         """
+        # -A    Show all sites, inc. sites without variants
+        # -f    Use BED-6 extended output format
+        # -p    Number of threads
+        # -r    Path of output file
+        # -c    Minimum coverage: 1 read
+        # -s    Store feature-filtered results in another file
         java -Xmx${task.memory.toMega()}M -jar /usr/local/bin/JACUSA_v2.0.4.jar \
             call-1 \
             -A \
-            -f V \
+            -f B \
             -p ${task.cpus} \
             -r ${base_name}.site_edits_jacusa2.txt \
             -c 1 \
diff --git a/rain.nf b/rain.nf
@@ -591,6 +591,8 @@ workflow {
                 // Create a fasta index file of the reference genome
                 samtools_fasta_index(genome.collect())
                 jacusa2(samtools_index.out.tuple_sample_bam_bamindex, samtools_fasta_index.out.tuple_fasta_fastaindex.collect())
+                normalize_gxf(annotation.collect())
+                pluviometer(jacusa2.out.tuple_sample_jacusa2_table, normalize_gxf.out.gff.collect(), "jacusa2")
                 break
             case "sapin":
                 sapin(tuple_sample_bam_processed, genome.collect())