# Column names of a REDItools output record, listed in column order.
REDITOOLS_FIELDS = [
    "Seqid",
    "Position",
    "Reference",
    "Strand",
    "Coverage",
    "MeanQ",
    "Frequencies",
]
# Column name -> zero-based column index, used to pick values out of a split line.
REDITOOLS_FIELD_INDEX = {name: column for column, name in enumerate(REDITOOLS_FIELDS)}
99
# Column names of a Jacusa output record (extended BED6 format), listed in column order.
JACUSA_FIELDS = [
    "contig",
    "start",
    "end",
    "name",
    "score",
    "strand",
    "bases11",
    "info",
    "filter",
    "ref",
]
# Column name -> zero-based column index, used to pick values out of a split line.
JACUSA_FIELDS_INDEX = {name: column for column, name in enumerate(JACUSA_FIELDS)}
13+
def skip_comments(handle: TextIO, s: str) -> Optional[str]:
    """
    Read and return the next line in a text file handle that does not start with the comment prefix `s`.

    Lines are consumed from the handle's current position, so after the call the
    handle points just past the returned line. The returned line keeps its line
    terminator.

    At end of file the empty string is returned; this function never returns
    None (the `Optional` annotation is kept only for interface compatibility).
    An empty prefix matches every line, so the rest of the file is consumed and
    "" is returned.
    """
    line: str = handle.readline()

    # readline() yields "" only at EOF. Testing that first guarantees termination
    # even for an empty prefix, where "".startswith("") is True and would
    # otherwise spin forever once the file is exhausted.
    while line and line.startswith(s):
        line = handle.readline()

    return line
24+
25+
1026class RNAVariantReader (ABC ):
1127 """Abstract class defining the API for readers"""
1228
@@ -48,7 +64,8 @@ def read(self) -> Optional[SiteVariantData]:
4864 strand = self .strand ,
4965 coverage = 1 ,
5066 mean_quality = 30.0 ,
51- frequencies = self .frequencies
67+ frequencies = self .frequencies ,
68+ score = 0.0
5269 )
5370 self .position += 1
5471
@@ -128,7 +145,8 @@ def _parse_parts(self) -> SiteVariantData:
128145 strand = strand ,
129146 coverage = int (self .parts [REDITOOLS_FIELD_INDEX ["Coverage" ]]),
130147 mean_quality = float (self .parts [REDITOOLS_FIELD_INDEX ["MeanQ" ]]),
131- frequencies = np .int32 (self .parts [REDITOOLS_FIELD_INDEX ["Frequencies" ]][1 :- 1 ].split ("," ) + [0 ])
148+ frequencies = np .int32 (self .parts [REDITOOLS_FIELD_INDEX ["Frequencies" ]][1 :- 1 ].split ("," ) + [0 ]),
149+ score = 0.0
132150 )
133151
134152 def read (self ) -> Optional [SiteVariantData ]:
@@ -147,7 +165,6 @@ def close(self) -> None:
147165 """Close the file"""
148166 self .file_handle .close ()
149167
150-
151168class Reditools2Reader (ReditoolsXReader ):
152169 def parse_strand (self ) -> int :
153170 strand = int (self .parts [REDITOOLS_FIELD_INDEX ["Strand" ]])
@@ -174,3 +191,46 @@ def parse_strand(self) -> int:
174191 case _:
175192 raise Exception (f"Invalid strand value: { strand_str } " )
176193
194+ class Jacusa2Reader ():
195+ def __init__ (self , file_handle : TextIO ) -> None :
196+ self .file_handle : TextIO = file_handle
197+
198+ line = skip_comments (self .file_handle , "##" )
199+
200+ # Check the Jacusa header
201+ assert line .strip ().lstrip ('#' ).split ('\t ' ) == JACUSA_FIELDS
202+
203+ return None
204+
205+ def read (self ) -> Optional [SiteVariantData ]:
206+ line : str = self .file_handle .readline ().strip ()
207+
208+ if line == "" :
209+ return None
210+
211+ parts : list [str ] = line .split ('\t ' )
212+
213+ reference_nuc_str : str = parts [JACUSA_FIELDS_INDEX ["ref" ]]
214+
215+ strand_str : str = parts [JACUSA_FIELDS_INDEX ["strand" ]]
216+
217+ match strand_str :
218+ case '.' :
219+ strand = 0
220+ case '+' :
221+ strand = 1
222+ case '-' :
223+ strand - 1
224+
225+ frequencies = np .int32 (parts [JACUSA_FIELDS_INDEX ["bases11" ]].split (',' ) + [0 ])
226+
227+ return SiteVariantData (
228+ seqid = parts [JACUSA_FIELDS_INDEX ["contig" ]],
229+ position = int (parts [JACUSA_FIELDS_INDEX ["start" ]]), # Jacusa2 position is 0-based
230+ reference = NUC_STR_TO_IND [reference_nuc_str ],
231+ strand = strand ,
232+ coverage = sum (frequencies ),
233+ mean_quality = float ("nan" ),
234+ frequencies = frequencies ,
235+ score = float (parts [JACUSA_FIELDS_INDEX ["score" ]])
236+ )
0 commit comments