Skip to content

Commit 0024d53

Browse files
authored
Merge pull request #41 from Juke34/jacusa2
Jacusa2
2 parents 28349e6 + da045b1 commit 0024d53

6 files changed

Lines changed: 84 additions & 11 deletions

File tree

bin/stats/pluviometer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import numpy as np
1414
from numpy.typing import NDArray
1515
from typing import TextIO, NamedTuple, Optional, Generator
16-
from site_variant_readers import RNAVariantReader, Reditools2Reader, Reditools3Reader
16+
from site_variant_readers import RNAVariantReader, Reditools2Reader, Reditools3Reader, Jacusa2Reader
1717
import argparse
1818
from contextlib import nullcontext
1919
import sys
@@ -520,6 +520,9 @@ def scan_and_count(self, reader: RNAVariantReader) -> None:
520520
case "reditools3":
521521
print("Reditools3 format\n")
522522
sv_reader: RNAVariantReader = Reditools3Reader(sv_handle)
523+
case "jacusa2":
524+
print("Jacusa2 BED6 extended format")
525+
sv_reader: RNAVariantReader = Jacusa2Reader(sv_handle)
523526
case _:
524527
raise Exception(f'Unimplemented format "{args.format}"')
525528

bin/stats/site_variant_readers.py

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,22 @@
77
REDITOOLS_FIELDS = ["Seqid", "Position", "Reference", "Strand", "Coverage", "MeanQ", "Frequencies"]
88
REDITOOLS_FIELD_INDEX = {field: i for i, field in enumerate(REDITOOLS_FIELDS)}
99

10+
# Jacusa output file fields in the extended BED6 format
11+
JACUSA_FIELDS = ["contig", "start", "end", "name", "score", "strand", "bases11", "info", "filter", "ref"]
12+
JACUSA_FIELDS_INDEX = {field: i for i, field in enumerate(JACUSA_FIELDS)}
13+
14+
def skip_comments(handle: TextIO, s: str) -> Optional[str]:
15+
"""
16+
Read and return the next line in a text file handle that does not start with the comment prefix `s`.
17+
"""
18+
line: str = handle.readline()
19+
20+
while line.startswith(s):
21+
line = handle.readline()
22+
23+
return line
24+
25+
1026
class RNAVariantReader(ABC):
1127
"""Abstract class defining the API for readers"""
1228

@@ -48,7 +64,8 @@ def read(self) -> Optional[SiteVariantData]:
4864
strand=self.strand,
4965
coverage=1,
5066
mean_quality=30.0,
51-
frequencies=self.frequencies
67+
frequencies=self.frequencies,
68+
score=0.0
5269
)
5370
self.position += 1
5471

@@ -128,7 +145,8 @@ def _parse_parts(self) -> SiteVariantData:
128145
strand=strand,
129146
coverage=int(self.parts[REDITOOLS_FIELD_INDEX["Coverage"]]),
130147
mean_quality=float(self.parts[REDITOOLS_FIELD_INDEX["MeanQ"]]),
131-
frequencies=np.int32(self.parts[REDITOOLS_FIELD_INDEX["Frequencies"]][1:-1].split(",") + [0])
148+
frequencies=np.int32(self.parts[REDITOOLS_FIELD_INDEX["Frequencies"]][1:-1].split(",") + [0]),
149+
score=0.0
132150
)
133151

134152
def read(self) -> Optional[SiteVariantData]:
@@ -147,7 +165,6 @@ def close(self) -> None:
147165
"""Close the file"""
148166
self.file_handle.close()
149167

150-
151168
class Reditools2Reader(ReditoolsXReader):
152169
def parse_strand(self) -> int:
153170
strand = int(self.parts[REDITOOLS_FIELD_INDEX["Strand"]])
@@ -174,3 +191,46 @@ def parse_strand(self) -> int:
174191
case _:
175192
raise Exception(f"Invalid strand value: {strand_str}")
176193

194+
class Jacusa2Reader():
195+
def __init__(self, file_handle: TextIO) -> None:
196+
self.file_handle: TextIO = file_handle
197+
198+
line = skip_comments(self.file_handle, "##")
199+
200+
# Check the Jacusa header
201+
assert line.strip().lstrip('#').split('\t') == JACUSA_FIELDS
202+
203+
return None
204+
205+
def read(self) -> Optional[SiteVariantData]:
206+
line: str = self.file_handle.readline().strip()
207+
208+
if line == "":
209+
return None
210+
211+
parts: list[str] = line.split('\t')
212+
213+
reference_nuc_str: str = parts[JACUSA_FIELDS_INDEX["ref"]]
214+
215+
strand_str: str = parts[JACUSA_FIELDS_INDEX["strand"]]
216+
217+
match strand_str:
218+
case '.':
219+
strand = 0
220+
case '+':
221+
strand = 1
222+
case '-':
223+
strand -1
224+
225+
frequencies=np.int32(parts[JACUSA_FIELDS_INDEX["bases11"]].split(',') + [0])
226+
227+
return SiteVariantData(
228+
seqid=parts[JACUSA_FIELDS_INDEX["contig"]],
229+
position=int(parts[JACUSA_FIELDS_INDEX["start"]]), # Jacusa2 position is 0-based
230+
reference=NUC_STR_TO_IND[reference_nuc_str],
231+
strand=strand,
232+
coverage=sum(frequencies),
233+
mean_quality=float("nan"),
234+
frequencies=frequencies,
235+
score=float(parts[JACUSA_FIELDS_INDEX["score"]])
236+
)

bin/stats/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,4 @@ class SiteVariantData:
4949
coverage: int
5050
mean_quality: float
5151
frequencies: NDArray[np.int32]
52+
score: float

docker/jacusa2/Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,6 @@ RUN wget https://github.com/dieterich-lab/JACUSA2/releases/download/v2.0.4/JACUS
77
RUN echo "#!/usr/bin/env bash" > /usr/local/bin/jacusa2
88
RUN echo "java -jar /usr/local/bin/JACUSA_v2.0.4.jar $@" >> /usr/local/bin/jacusa2
99
RUN chmod +x /usr/local/bin/jacusa2
10+
11+
# ⚠️ IMPORTANT
12+
# Run the Docker image with the option `--ulimit nofile=1024:1024`

modules/jacusa2.nf

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,25 +15,29 @@ process jacusa2 {
1515
base_name = bam.BaseName
1616
// Set the strand orientation parameter from the library type parameter
1717
// Terms explained in https://salmon.readthedocs.io/en/latest/library_type.html
18-
if (meta.libtype in ["ISR", "SR"]) {
18+
if (meta.strandedness in ["ISR", "SR"]) {
1919
// First-strand oriented
2020
jacusa_strand_param = "FR_SECONDSTRAND"
21-
} else if (meta.libtype in ["ISF", "SF"]) {
21+
} else if (meta.strandedness in ["ISF", "SF"]) {
2222
// Second-strand oriented
2323
jacusa_strand_param = "RF_FIRSTSTRAND"
24-
} else if (meta.libtype in ["IU", "U"]) {
24+
} else if (meta.strandedness in ["IU", "U"]) {
2525
// Unstranded
2626
jacusa_strand_param = "UNSTRANDED"
2727
} else {
28-
// Unsupported: Pass the library type string so that it's reported in
29-
// the Jacusa2 error message
30-
jacusa_strand_param = meta.libtype
28+
jacusa_strand_param = "UNSTRANDED"
3129
}
3230
"""
31+
# -A Show all sites, inc. sites without variants
32+
# -f Use BED-6 extended output format
33+
# -p Number of threads
34+
# -r Path of output file
35+
# -c Minimum coverage: 1 read
36+
# -s Store feature-filtered results in another file
3337
java -Xmx${task.memory.toMega()}M -jar /usr/local/bin/JACUSA_v2.0.4.jar \
3438
call-1 \
3539
-A \
36-
-f V \
40+
-f B \
3741
-p ${task.cpus} \
3842
-r ${base_name}.site_edits_jacusa2.txt \
3943
-c 1 \

rain.nf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,8 @@ workflow {
591591
// Create a fasta index file of the reference genome
592592
samtools_fasta_index(genome.collect())
593593
jacusa2(samtools_index.out.tuple_sample_bam_bamindex, samtools_fasta_index.out.tuple_fasta_fastaindex.collect())
594+
normalize_gxf(annotation.collect())
595+
pluviometer(jacusa2.out.tuple_sample_jacusa2_table, normalize_gxf.out.gff.collect(), "jacusa2")
594596
break
595597
case "sapin":
596598
sapin(tuple_sample_bam_processed, genome.collect())

0 commit comments

Comments
 (0)