From b39bc7d083545e1fcc34982401b2eeedfe5450f4 Mon Sep 17 00:00:00 2001 From: Gemy Kaithakottil Date: Fri, 28 Nov 2025 14:29:46 +0000 Subject: [PATCH 01/10] chore: format with ruff --- Mikado/parsers/bed12.py | 1016 ++++++++++++++++++++++++++------------- 1 file changed, 685 insertions(+), 331 deletions(-) diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index a28cad7f..7c95ffe4 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -40,11 +40,13 @@ codons = copy.deepcopy(CodonTable.ambiguous_dna_by_id[1]._codon_table) codons.start_codons = ["ATG"] -standard = CodonTable.AmbiguousCodonTable(codons, - IUPACData.ambiguous_dna_letters, - IUPACData.ambiguous_dna_values, - IUPACData.extended_protein_letters, - IUPACData.extended_protein_values) +standard = CodonTable.AmbiguousCodonTable( + codons, + IUPACData.ambiguous_dna_letters, + IUPACData.ambiguous_dna_values, + IUPACData.extended_protein_letters, + IUPACData.extended_protein_values, +) assert standard.start_codons == ["ATG"] assert CodonTable.ambiguous_dna_by_id[1].start_codons != ["ATG"] @@ -66,16 +68,20 @@ def get_tables(table, to_stop=False, gap=None, stop_symbol="*"): if dual_coding: c = dual_coding[0] if to_stop: - raise ValueError("You cannot use 'to_stop=True' with this table " - "as it contains {} codon(s) which can be both " - " STOP and an amino acid (e.g. '{}' -> '{}' or " - "STOP)." - .format(len(dual_coding), c, forward_table[c])) - warnings.warn("This table contains {} codon(s) which code(s) for both " - "STOP and an amino acid (e.g. '{}' -> '{}' or STOP). " - "Such codons will be translated as amino acid." - .format(len(dual_coding), c, forward_table[c]), - BiopythonWarning) + raise ValueError( + "You cannot use 'to_stop=True' with this table " + "as it contains {} codon(s) which can be both " + " STOP and an amino acid (e.g. '{}' -> '{}' or " + "STOP).".format(len(dual_coding), c, forward_table[c]) + ) + warnings.warn( + "This table contains {} codon(s) which code(s) for both " + "STOP and an amino acid (e.g. '{}' -> '{}' or STOP). " + "Such codons will be translated as amino acid.".format( + len(dual_coding), c, forward_table[c] + ), + BiopythonWarning, + ) for stop in stop_codons: forward_table[stop] = stop_symbol @@ -88,7 +94,9 @@ def get_tables(table, to_stop=False, gap=None, stop_symbol="*"): return forward_table, getter, valid_letters -def _translate_str(sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None): +def _translate_str( + sequence, table, stop_symbol="*", to_stop=False, cds=False, pos_stop="X", gap=None +): """Translate nucleotide string into a protein string (PRIVATE). Arguments: @@ -153,33 +161,50 @@ def _translate_str(sequence, table, stop_symbol="*", to_stop=False, cds=False, p # Check that the pos_stop is a single character # By default this is the "X" character (equivalent to "N" for nucleotides) if not (isinstance(pos_stop, (bytes, str)) and len(pos_stop) == 1): - raise ValueError("Pos_stop must be a single character, not {pos_stop}".format(pos_stop=pos_stop)) + raise ValueError( + "Pos_stop must be a single character, not {pos_stop}".format( + pos_stop=pos_stop + ) + ) if isinstance(pos_stop, bytes): pos_stop = pos_stop.decode() if cds and len(sequence) % 3 != 0: - raise CodonTable.TranslationError("Sequence length {0} is not a multiple of three".format( - len(sequence) - )) + raise CodonTable.TranslationError( + "Sequence length {0} is not a multiple of three".format(len(sequence)) + ) elif gap is not None and (not isinstance(gap, str) or len(gap) > 1): raise ValueError("Gap character should be a single character string.") - forward_table, getter, valid_letters = get_tables(table, to_stop=to_stop, gap=gap, stop_symbol=stop_symbol) + forward_table, getter, valid_letters = get_tables( + table, to_stop=to_stop, gap=gap, stop_symbol=stop_symbol + ) sequence = sequence.upper() if not valid_letters.issuperset(set(sequence)): - raise CodonTable.TranslationError("Invalid letters in the sequence: {}".format( - set.difference(set(sequence), valid_letters) - )) - - amino_acids = getter(np.array( - [sequence[start:start + 3] for start in range(0, len(sequence) - len(sequence) % 3, 3)], dtype=" 1: raise CodonTable.TranslationError( - "Extra in-frame stop codon found. Sequence: {sequence}".format(sequence=sequence)) + "Extra in-frame stop codon found. Sequence: {sequence}".format( + sequence=sequence + ) + ) elif cds and found_stops and _stop_locations[0] < len(amino_acids) - 1: raise CodonTable.TranslationError( "Extra in-frame stop codon. Sequence:\n{sequence}\n{spaces}^^^".format( - sequence=sequence, spaces=" " * _stop_locations[0] * 3)) + sequence=sequence, spaces=" " * _stop_locations[0] * 3 + ) + ) if to_stop and found_stops > 0: - amino_acids = amino_acids[:_stop_locations[0]] + amino_acids = amino_acids[: _stop_locations[0]] return "".join(amino_acids) @@ -202,7 +232,6 @@ def _translate_str(sequence, table, stop_symbol="*", to_stop=False, cds=False, p # These classes do contain lots of things, it is correct like it is # pylint: disable=too-many-instance-attributes class BED12: - """ BED12 parsing class. """ @@ -211,18 +240,20 @@ class BED12: _attribute_pattern = re.compile(r"([^;]*)=([^$=]*)(?:;|$)") - def __init__(self, *args: Union[str, list, tuple, GffLine], - fasta_index=None, - phase=None, - sequence=None, - transcriptomic=False, - max_regression=0, - start_adjustment=True, - coding=True, - lenient=False, - table=0, - logger=create_null_logger()): - + def __init__( + self, + *args: Union[str, list, tuple, GffLine], + fasta_index=None, + phase=None, + sequence=None, + transcriptomic=False, + max_regression=0, + start_adjustment=True, + coding=True, + lenient=False, + table=0, + logger=create_null_logger(), + ): """ :param args: the BED12 line. :type args: (str, list, tuple, GffLine) @@ -324,7 +355,7 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], self.name = "" self.score = 0 self.strand = None - self.rgb = '' + self.rgb = "" self.stop_codon = self.start_codon = None self.__has_start = self.__has_stop = False self.__block_sizes = np.zeros(1, dtype=np.int64) @@ -356,7 +387,7 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], self._line = args[0] if isinstance(self._line, str) or self._line is None: if self._line is None: - self._line = '' + self._line = "" self._line = self._line.rstrip() if len(self._line) == 0 or self._line[0] == "#": self.header = True @@ -384,7 +415,9 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], fasta_length = len(sequence) elif fasta_index: if isinstance(fasta_index, pysam.FastaFile): - fasta_length = fasta_index.get_reference_length(self._line.chrom) + fasta_length = fasta_index.get_reference_length( + self._line.chrom + ) elif isinstance(fasta_index, pyfaidx.Fasta): sequence = fasta_index[self._line.chrom] fasta_length = len(fasta_index[self._line.chrom]) @@ -395,7 +428,9 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], self.__set_values_from_gff(fasta_length) elif not (isinstance(self._line, list) or isinstance(self._line, tuple)): - raise InvalidParsingFormat("I need an ordered array, not {0}".format(type(self._line))) + raise InvalidParsingFormat( + "I need an ordered array, not {0}".format(type(self._line)) + ) else: self._fields = self._line print("Line", self._fields) @@ -404,7 +439,11 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], self.__check_validity(transcriptomic, fasta_index, sequence) if self.invalid and self.coding: - self.logger.debug("%s cannot be coding as it is invalid (reason: %s)", self.chrom, self.invalid_reason) + self.logger.debug( + "%s cannot be coding as it is invalid (reason: %s)", + self.chrom, + self.invalid_reason, + ) self.coding = False if self.coding and self.phase is None: @@ -438,7 +477,9 @@ def table(self): @table.setter def table(self, table): - if isinstance(table, bool): # Boolean can be considered as int so this requires special handling + if isinstance( + table, bool + ): # Boolean can be considered as int so this requires special handling raise ValueError(f"Invalid table specified: {table} (type {type(table)})") elif table is not None and not isinstance(table, (int, float, bytes, str)): raise ValueError(f"Invalid table specified: {table} (type {type(table)})") @@ -461,16 +502,21 @@ def table(self, table): self.__table_index = 0 elif isinstance(table, int): if table not in ambiguous_dna_by_id.keys(): - raise ValueError(f"Invalid table code specified: {table}. Available codes: " - f"{', '.join([str(_) for _ in ambiguous_dna_by_id.keys()])}") + raise ValueError( + f"Invalid table code specified: {table}. Available codes: " + f"{', '.join([str(_) for _ in ambiguous_dna_by_id.keys()])}" + ) self.__table = ambiguous_dna_by_id[table] - assert self.__table.start_codons == ["ATG"] if table == 0 else True, f"Invalid codons for table 0: " \ - f"{self.__table.start_codons}" + assert self.__table.start_codons == ["ATG"] if table == 0 else True, ( + f"Invalid codons for table 0: {self.__table.start_codons}" + ) self.__table_index = table elif isinstance(table, str): if table not in ambiguous_dna_by_name.keys(): - raise ValueError(f"Invalid table name specified: {table}. Available table: " - f"{', '.join([str(_) for _ in ambiguous_dna_by_name.keys()])}") + raise ValueError( + f"Invalid table name specified: {table}. Available table: " + f"{', '.join([str(_) for _ in ambiguous_dna_by_name.keys()])}" + ) self.__table = ambiguous_dna_by_name[table] self.__table_index = ambiguous_dna_by_name[table].id return @@ -482,11 +528,15 @@ def parent(self, parent): self.__parent = [parent] def __getstate__(self): - - state = copy.deepcopy(dict((key, val) for key, val in self.__dict__.items() - if key not in ("_BED12_table") and - not isinstance(val, logging.Logger) and - not isinstance(val, CodonTable.CodonTable))) + state = copy.deepcopy( + dict( + (key, val) + for key, val in self.__dict__.items() + if key not in ("_BED12_table") + and not isinstance(val, logging.Logger) + and not isinstance(val, CodonTable.CodonTable) + ) + ) return state @@ -496,7 +546,6 @@ def __setstate__(self, state): self.table = self.__table_index def _parse_attributes(self, attributes): - """ Private method that parses the last field of the GFF line. :return: @@ -526,15 +575,24 @@ def _parse_attributes(self, attributes): continue def __set_values_from_fields(self): - """ Private method that sets the correct values from the fields derived from the input line. :return: """ - self.chrom, self.start, self.end, \ - self.name, self.score, self.strand, \ - self.thick_start, self.thick_end, self.rgb, \ - self.block_count, block_sizes, block_starts = self._fields[:12] + ( + self.chrom, + self.start, + self.end, + self.name, + self.score, + self.strand, + self.thick_start, + self.thick_end, + self.rgb, + self.block_count, + block_sizes, + block_starts, + ) = self._fields[:12] # Reduce memory usage intern(self.chrom) @@ -567,7 +625,6 @@ def __set_values_from_fields(self): return def __set_values_from_bed12(self, line): - self.__setstate__(line.__getstate__()) return @@ -577,19 +634,24 @@ def __set_values_from_gff(self, fasta_length): :return: """ - (self.chrom, self.thick_start, - self.thick_end, self.strand, self.name) = (self._line.chrom, - self._line.start, - self._line.end, self._line.strand, self._line.id) + (self.chrom, self.thick_start, self.thick_end, self.strand, self.name) = ( + self._line.chrom, + self._line.start, + self._line.end, + self._line.strand, + self._line.id, + ) intern(self.chrom) if self.name is None: - raise InvalidParsingFormat("{self} should have the name property defined".format(self=repr(self))) + raise InvalidParsingFormat( + "{self} should have the name property defined".format(self=repr(self)) + ) self.start = 1 self.end = fasta_length self.score = self._line.score self.rgb = None self.block_count = 1 - self.block_sizes = [self.thick_end - self.thick_start +1] + self.block_sizes = [self.thick_end - self.thick_start + 1] self.block_starts = [self.thick_start] self.has_start_codon = False self.has_stop_codon = False @@ -609,15 +671,21 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): if transcriptomic is True and self.coding is True: if not (fasta_index is not None or sequence is not None): - self.logger.debug("No further check on the validity of %s as no sequence has been provided.", - self.chrom) + self.logger.debug( + "No further check on the validity of %s as no sequence has been provided.", + self.chrom, + ) return if transcriptomic is True: self.has_start_codon = False self.has_stop_codon = False - if transcriptomic is True and self.coding is True and (fasta_index is not None or sequence is not None): + if ( + transcriptomic is True + and self.coding is True + and (fasta_index is not None or sequence is not None) + ): self.logger.debug("Starting to check the validity of %s", self.chrom) self.validity_checked = True if sequence is not None: @@ -628,7 +696,9 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): sequence = str(sequence) else: if self.id not in fasta_index: - self.logger.warning("%s not found in the index. Aborting the check, we will trust the ORF as-is.") + self.logger.warning( + "%s not found in the index. Aborting the check, we will trust the ORF as-is." + ) self.__in_index = False return self.fasta_length = len(fasta_index[self.id]) @@ -647,17 +717,32 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): if self.strand != "-": orf_sequence = sequence[ - (self.thick_start - 1 if not self.phase else self.start + self.phase - 1):self.thick_end] + ( + self.thick_start - 1 + if not self.phase + else self.start + self.phase - 1 + ) : self.thick_end + ] else: orf_sequence = Seq.reverse_complement( - sequence[(self.thick_start - 1):( - self.thick_end if not self.phase else self.end - (3 - self.phase) % 3)]) + sequence[ + (self.thick_start - 1) : ( + self.thick_end + if not self.phase + else self.end - (3 - self.phase) % 3 + ) + ] + ) self.start_codon = str(orf_sequence)[:3].upper() self.stop_codon = str(orf_sequence[-3:]).upper() - if self.start_codon in self.table.start_codons and (self.phase is None or self.phase == 0): - self.logger.debug("Found start codon for %s. Setting phase to 0", self.chrom) + if self.start_codon in self.table.start_codons and ( + self.phase is None or self.phase == 0 + ): + self.logger.debug( + "Found start codon for %s. Setting phase to 0", self.chrom + ) self.has_start_codon = True self.phase = 0 else: @@ -674,25 +759,39 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): elif self.strand == "-" and self.thick_start - self.start < 3: self.thick_start = 1 - self.logger.debug("%s with start codon (%s) and stop codon (%s). Valid: %s", - self.chrom, self.has_start_codon, self.has_stop_codon, not self.invalid) + self.logger.debug( + "%s with start codon (%s) and stop codon (%s). Valid: %s", + self.chrom, + self.has_start_codon, + self.has_stop_codon, + not self.invalid, + ) # Get only a proper multiple of three if self.lenient is False and self.coding is True: if self.strand != "-": orf_sequence = sequence[ - (self.thick_start - 1 if not self.phase - else self.start + self.phase - 1):self.thick_end] + ( + self.thick_start - 1 + if not self.phase + else self.start + self.phase - 1 + ) : self.thick_end + ] else: orf_sequence = Seq.reverse_complement( sequence[ - (self.thick_start - 1): - (self.thick_end if not self.phase else self.end - self.phase)]) + (self.thick_start - 1) : ( + self.thick_end + if not self.phase + else self.end - self.phase + ) + ] + ) last_pos = -3 - ((len(orf_sequence)) % 3) - translated_seq = _translate_str(orf_sequence[:last_pos], - table=self.table, - gap='N') + translated_seq = _translate_str( + orf_sequence[:last_pos], table=self.table, gap="N" + ) self._internal_stop_codons = str(translated_seq).count("*") if self._internal_stop_codons == 0 and len(orf_sequence[last_pos:]) > 3: @@ -702,25 +801,31 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): if self.strand == "-": self.thick_start += -last_pos % 3 self.logger.warning( - f"Shifting the position of the thick start of {self.name} by {-last_pos % 3}") + f"Shifting the position of the thick start of {self.name} by {-last_pos % 3}" + ) else: self.thick_end -= -last_pos % 3 self.logger.warning( - f"Shifting the position of the thick end of {self.name} by {-last_pos % 3}") + f"Shifting the position of the thick end of {self.name} by {-last_pos % 3}" + ) del self.invalid if self.__is_invalid() is True: return def _adjust_start(self, sequence, orf_sequence): - if len(orf_sequence) != (self.thick_end - self.thick_start + 1 - self.phase): # We are checking that the sequence of the ORF (provided as argument) is the same length as # the imputed length of the ORF - raise ValueError("The provided orf_sequence of length {lorf} is different from the imputed length of the\ + raise ValueError( + "The provided orf_sequence of length {lorf} is different from the imputed length of the\ ORF for {sid} (total {ltotal}; thick start {self.thick_start}, thick end {self.thick_end}, phase {self.phase})".format( - lorf=len(orf_sequence), ltotal=self.thick_end - self.thick_start + 1 - self.phase, - sid=self.name, self=self)) + lorf=len(orf_sequence), + ltotal=self.thick_end - self.thick_start + 1 - self.phase, + sid=self.name, + self=self, + ) + ) # Let's check UPstream first. # This means that we DO NOT have a starting Met and yet we are starting far upstream. @@ -728,11 +833,19 @@ def _adjust_start(self, sequence, orf_sequence): if self.strand == "+" and self.thick_start > 3: for pos in range(self.thick_start, 3, -3): self.thick_start -= 3 - codon = sequence[pos - 4:pos - 1] - is_start, is_stop = ((codon in self.table.start_codons), - (codon in self.table.stop_codons)) - self.logger.debug("Checking pos %s (%s) for %s, start: %s; stop: %s", - pos, codon, self.chrom, is_start, is_stop) + codon = sequence[pos - 4 : pos - 1] + is_start, is_stop = ( + (codon in self.table.start_codons), + (codon in self.table.stop_codons), + ) + self.logger.debug( + "Checking pos %s (%s) for %s, start: %s; stop: %s", + pos, + codon, + self.chrom, + is_start, + is_stop, + ) if is_start: # We have found a valid methionine. break @@ -742,20 +855,27 @@ def _adjust_start(self, sequence, orf_sequence): assert self.invalid is True self.logger.debug( "Found in-frame stop codon for %s while expanding, stopping here. Invalid: %s (reason %s)", - self.chrom, self.invalid, self.invalid_reason) + self.chrom, + self.invalid, + self.invalid_reason, + ) break continue elif self.strand == "-" and self.end - self.thick_end > 3: for pos in range(self.thick_end, self.end - 3, 3): self.thick_end += 3 - codon = Seq.reverse_complement(sequence[pos - 3:pos]) - is_start, is_stop = ((codon in self.table.start_codons), - (codon in self.table.stop_codons)) + codon = Seq.reverse_complement(sequence[pos - 3 : pos]) + is_start, is_stop = ( + (codon in self.table.start_codons), + (codon in self.table.stop_codons), + ) if is_start: # We have found a valid methionine. - self.logger.debug("Found correct start codon for %s while expanding, stopping here.", - self.chrom) + self.logger.debug( + "Found correct start codon for %s while expanding, stopping here.", + self.chrom, + ) break elif is_stop: self.stop_codon = codon @@ -763,7 +883,10 @@ def _adjust_start(self, sequence, orf_sequence): assert self.invalid is True self.logger.debug( "Found in-frame stop codon for %s while expanding, stopping here. Invalid: %s (reason %s)", - self.chrom, self.invalid, self.invalid_reason) + self.chrom, + self.invalid, + self.invalid_reason, + ) break else: self._regression(orf_sequence) @@ -786,30 +909,45 @@ def _adjust_start(self, sequence, orf_sequence): else: self.phase = 0 else: - self.logger.debug("Setting phase of %s at 0 (end: %s; thick end: %s; thick start %s)", - self.chrom, self.end, self.thick_end, self.thick_start) + self.logger.debug( + "Setting phase of %s at 0 (end: %s; thick end: %s; thick start %s)", + self.chrom, + self.end, + self.thick_end, + self.thick_start, + ) self.phase = 0 del self.invalid if self.invalid: - self.logger.debug("%s is not coding after checking. Reason: %s", self.chrom, self.invalid_reason) + self.logger.debug( + "%s is not coding after checking. Reason: %s", + self.chrom, + self.invalid_reason, + ) self.coding = False def _regression(self, orf_sequence): self.logger.debug( "Starting the regression algorithm to find an internal start for %s (end: %s; thick start/end: %s, %s; phase %s)", - self.chrom, self.end, self.thick_start, self.thick_end, self.phase) + self.chrom, + self.end, + self.thick_start, + self.thick_end, + self.phase, + ) if self.strand != "-": # self.thick_start = self.phase + 3 - self.logger.debug("Starting to analyse %s; positions %s-%s", - self.chrom, - self.phase + 3, - self.phase + 3 + int(len(orf_sequence) * self.max_regression), - ) - for pos in range(self.phase + 3, - int(len(orf_sequence) * self.max_regression), - 3): - codon = orf_sequence[pos:pos + 3] + self.logger.debug( + "Starting to analyse %s; positions %s-%s", + self.chrom, + self.phase + 3, + self.phase + 3 + int(len(orf_sequence) * self.max_regression), + ) + for pos in range( + self.phase + 3, int(len(orf_sequence) * self.max_regression), 3 + ): + codon = orf_sequence[pos : pos + 3] # self.logger.debug("Testing position %s-%s (%s)", pos, pos + 3, codon) if codon in self.table.start_codons: # Now we have to shift the start accordingly @@ -819,20 +957,26 @@ def _regression(self, orf_sequence): break else: continue - self.logger.debug("Final internal coords for %s: %s-%s", self.chrom, self.thick_start, self.thick_end) + self.logger.debug( + "Final internal coords for %s: %s-%s", + self.chrom, + self.thick_start, + self.thick_end, + ) elif self.strand == "-": if self.end - self.thick_end < 3: self.phase = (3 - (self.end - self.thick_end) % 3) % 3 - self.logger.debug("Starting to analyse %s (phase %s); positions %s-%s", - self.chrom, - self.phase, - self.phase + 3, - self.phase + 3 + int(len(orf_sequence) * self.max_regression), - ) - for pos in range(self.phase + 3, - int(len(orf_sequence) * self.max_regression), - 3): - codon = orf_sequence[pos:pos + 3] + self.logger.debug( + "Starting to analyse %s (phase %s); positions %s-%s", + self.chrom, + self.phase, + self.phase + 3, + self.phase + 3 + int(len(orf_sequence) * self.max_regression), + ) + for pos in range( + self.phase + 3, int(len(orf_sequence) * self.max_regression), 3 + ): + codon = orf_sequence[pos : pos + 3] # self.logger.debug("Testing position %s-%s (%s)", pos, pos + 3, codon) if codon in self.table.start_codons: # Now we have to shift the start accordingly @@ -840,7 +984,12 @@ def _regression(self, orf_sequence): self.thick_end -= pos self.phase = 0 break - self.logger.debug("Final internal coords for %s: %s-%s", self.chrom, self.thick_start, self.thick_end) + self.logger.debug( + "Final internal coords for %s: %s-%s", + self.chrom, + self.thick_start, + self.thick_end, + ) def __repr__(self): return pp.saferepr(self.__dict__) @@ -849,7 +998,6 @@ def __hash__(self): return hash(frozenset(self.__getstate__())) def __str__(self): - if self.header is True: if self._line is not None: return self._line @@ -879,8 +1027,12 @@ def __str__(self): line.append(self.block_count) line.append(",".join([str(x) for x in self.block_sizes])) line.append(",".join([str(x) for x in self.block_starts])) - attributes = dict((key.lower(), val) for key, val in self.attributes.items() if key.lower() not in - ("geneid", "gene_id", "name", "phase", "coding", "alias", "id")) + attributes = dict( + (key.lower(), val) + for key, val in self.attributes.items() + if key.lower() + not in ("geneid", "gene_id", "name", "phase", "coding", "alias", "id") + ) if self.parent is not None: attributes["Parent"] = self.parent[0] assert "Parent" in attributes @@ -892,14 +1044,25 @@ def __str__(self): parent = None attributes["Parent"] = parent if attributes: - line.append(";".join(f"{key}={val}" for key, val in attributes.items() if val is not None)) + line.append( + ";".join( + f"{key}={val}" for key, val in attributes.items() if val is not None + ) + ) return "\t".join([str(x) for x in line]) def __eq__(self, other): - for key in ["chrom", "strand", "start", - "end", "thick_start", "thick_end", - "block_count", "block_sizes", - "block_starts"]: + for key in [ + "chrom", + "strand", + "start", + "end", + "thick_start", + "thick_end", + "block_count", + "block_sizes", + "block_starts", + ]: if getattr(self, key) != getattr(other, key): return False return True @@ -908,11 +1071,9 @@ def __len__(self): return self.end - self.start + 1 def copy(self): - return copy.deepcopy(self) def as_simple_dict(self): - return { "chrom": self.chrom, "id": self.id, @@ -1024,6 +1185,7 @@ def id(self): return self.chrom else: return self.name + # pylint: enable=invalid-name @property @@ -1047,9 +1209,10 @@ def lenient(self): return self.__lenient def __is_invalid(self): - if self._internal_stop_codons >= 1: - self.invalid_reason = "{} internal stop codons found".format(self._internal_stop_codons) + self.invalid_reason = "{} internal stop codons found".format( + self._internal_stop_codons + ) return True if self.fasta_length is None: @@ -1060,12 +1223,14 @@ def __is_invalid(self): pass else: invalid = "thickStart {0} self.end) + self.invalid_reason = invalid.format( + self.thick_start, + self.start, + self.thick_start < self.start, + self.end, + self.thick_end, + self.thick_end > self.end, + ) return True if self.transcriptomic is True: @@ -1075,8 +1240,7 @@ def __is_invalid(self): if len(self) != self.fasta_length: self.invalid_reason = "FASTA length != BED length: {0} vs. {1}".format( - self.fasta_length, - len(self) + self.fasta_length, len(self) ) return True @@ -1085,19 +1249,29 @@ def __is_invalid(self): else: if (self.cds_len - self.phase) % 3 != 0: if self.strand == "+" and self.thick_end != self.end: - self.invalid_reason = "Invalid CDS length: {0} % 3 = {1} ({2}-{3}, {4})".format( - self.cds_len - self.phase, - (self.cds_len - self.phase) % 3, - self.thick_start, self.thick_end, self.phase) + self.invalid_reason = ( + "Invalid CDS length: {0} % 3 = {1} ({2}-{3}, {4})".format( + self.cds_len - self.phase, + (self.cds_len - self.phase) % 3, + self.thick_start, + self.thick_end, + self.phase, + ) + ) return True elif self.strand == "-" and self.thick_start != self.start: - self.invalid_reason = "Invalid CDS length: {0} % 3 = {1} ({2}-{3}, {4})".format( - self.cds_len - self.phase, - (self.cds_len - self.phase) % 3, - self.thick_start, self.thick_end, self.phase) + self.invalid_reason = ( + "Invalid CDS length: {0} % 3 = {1} ({2}-{3}, {4})".format( + self.cds_len - self.phase, + (self.cds_len - self.phase) % 3, + self.thick_start, + self.thick_end, + self.phase, + ) + ) return True - self.invalid_reason = '' + self.invalid_reason = "" return False @property @@ -1133,7 +1307,9 @@ def start(self, value): try: value = int(value) except (ValueError, TypeError): - raise ValueError("Start must be an integer, not {}! Value: {}".format(type(value), value)) + raise ValueError( + "Start must be an integer, not {}! Value: {}".format(type(value), value) + ) self.__start = value del self.invalid @@ -1151,7 +1327,9 @@ def end(self, value): try: value = int(value) except (ValueError, TypeError): - raise ValueError("End must be an integer, not {}! Value: {}".format(type(value), value)) + raise ValueError( + "End must be an integer, not {}! Value: {}".format(type(value), value) + ) self.__end = value del self.invalid @@ -1169,7 +1347,11 @@ def thick_start(self, value): try: value = int(value) except (ValueError, TypeError): - raise ValueError("Thick start must be an integer, not {}! Value: {}".format(type(value), value)) + raise ValueError( + "Thick start must be an integer, not {}! Value: {}".format( + type(value), value + ) + ) self.__thick_start = value del self.invalid @@ -1187,7 +1369,11 @@ def thick_end(self, value): try: value = int(value) except (ValueError, TypeError): - raise ValueError("Thick end must be an integer, not {}! Value: {}".format(type(value), value)) + raise ValueError( + "Thick end must be an integer, not {}! Value: {}".format( + type(value), value + ) + ) self.__thick_end = value del self.invalid @@ -1211,10 +1397,12 @@ def phase(self): @phase.setter def phase(self, val): - if val not in (None, 0, 1, 2): - raise ValueError("Invalid frame specified for {}: {}. Must be None or 0, 1, 2".format( - self.name, val)) + raise ValueError( + "Invalid frame specified for {}: {}. Must be None or 0, 1, 2".format( + self.name, val + ) + ) elif self.transcriptomic is True and val not in (0, 1, 2): raise ValueError("A transcriptomic BED cannot have null frame.") del self.invalid @@ -1234,7 +1422,11 @@ def block_count(self, value): try: value = int(value) except (ValueError, TypeError): - raise ValueError("Block count must be an integer, not {}! Value: {}".format(type(value), value)) + raise ValueError( + "Block count must be an integer, not {}! Value: {}".format( + type(value), value + ) + ) self.__block_count = value del self.invalid @@ -1263,9 +1455,11 @@ def block_starts(self): def block_starts(self, starts): starts = np.array(starts) if not issubclass(starts.dtype.type, np.int64): - raise TypeError("Block sizes should be integers! Dtype: {}; array: {}".format( - starts.dtype, starts - )) + raise TypeError( + "Block sizes should be integers! Dtype: {}; array: {}".format( + starts.dtype, starts + ) + ) self.__block_starts = starts del self.invalid @@ -1290,11 +1484,20 @@ def _max_regression(self): def _max_regression(self, value): if not (isinstance(value, (int, float)) and 0 <= value <= 1): raise ValueError( - "Invalid value specified for _max_regression (must be between 0 and 1): {}".format(value)) + "Invalid value specified for _max_regression (must be between 0 and 1): {}".format( + value + ) + ) self.__max_regression = value - def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create_null_logger()): - + def expand( + self, + sequence, + upstream, + downstream, + expand_orf=False, + logger=create_null_logger(), + ): # TODO this needs revising. The expand_orf key does not act as it should, as the thick start and # end are changed even when it is set to False. @@ -1317,102 +1520,159 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create """ if upstream < 0 or downstream < 0: - raise ValueError("Upstream and downstream must be 0 or positive, not {upstream} and {downstream}".format( - upstream=upstream, downstream=downstream)) + raise ValueError( + "Upstream and downstream must be 0 or positive, not {upstream} and {downstream}".format( + upstream=upstream, downstream=downstream + ) + ) if len(sequence) != len(self) + upstream + downstream: raise ValueError( "When trying to expand the original sequence of length {lself} of {sid} by {upstream} upstream and {downstream} downstream nucleotides, \ the total length ({total}) is different from the length of the provided sequence ({lseq}).".format( - sid=self.id, lself=len(self), upstream=upstream, downstream=downstream, total=len(self) + upstream + downstream, - lseq=len(sequence))) + sid=self.id, + lself=len(self), + upstream=upstream, + downstream=downstream, + total=len(self) + upstream + downstream, + lseq=len(sequence), + ) + ) if len(self) == len(sequence): logger.debug( "The length of the sequence for {sid} is identical to the length of the original object. No action needed.".format( - sid=self.id)) + sid=self.id + ) + ) return if self.transcriptomic is False: - raise ValueError("This is not a transcriptomic BED12, I cannot expand it!\n{sself}".format( - sself=repr(self))) + raise ValueError( + "This is not a transcriptomic BED12, I cannot expand it!\n{sself}".format( + sself=repr(self) + ) + ) if self.strand == "-": raise NotImplementedError( - "{sid} is on the negative strand, I can only expand ORFs on the sense strand".format(sid=self.id)) + "{sid} is on the negative strand, I can only expand ORFs on the sense strand".format( + sid=self.id + ) + ) - old_sequence = sequence[upstream:len(self) + upstream] + old_sequence = sequence[upstream : len(self) + upstream] if len(old_sequence) + upstream + downstream != len(sequence): raise ValueError( "When trying to expand the original sequence of length {lself} of {sid} by {upstream} upstream and {downstream} downstream nucleotides, \ the length of the *imputed* old sequence ({lold}) does not tally up with the new sequence ({lnew})".format( - lself=len(self), sid=self.id, upstream=upstream, downstream=downstream, - lold=len(old_sequence), lnew=len(sequence))) + lself=len(self), + sid=self.id, + upstream=upstream, + downstream=downstream, + lold=len(old_sequence), + lnew=len(sequence), + ) + ) self.fasta_length = len(sequence) # I presume that the sequence is already in the right orientation old_start_pos = self.thick_start + self.phase - 1 old_end_pos = self.thick_end - (self.thick_end - old_start_pos) % 3 old_orf = old_sequence[old_start_pos:old_end_pos].upper() - logger.debug("Old sequence of %s (%s bps): %s[...]%s", self.id, len(old_sequence), - old_sequence[:10], old_sequence[-10:]) - logger.debug("Old ORF of %s (%s bps, phase %s): %s[...]%s", self.id, len(old_orf), self.phase, - old_orf[:10], old_orf[-10:]) + logger.debug( + "Old sequence of %s (%s bps): %s[...]%s", + self.id, + len(old_sequence), + old_sequence[:10], + old_sequence[-10:], + ) + logger.debug( + "Old ORF of %s (%s bps, phase %s): %s[...]%s", + self.id, + len(old_orf), + self.phase, + old_orf[:10], + old_orf[-10:], + ) # TODO: this function should not fail for non-coding transcripts assert len(old_orf) > 0, (old_start_pos, old_end_pos) assert len(old_orf) % 3 == 0, (old_start_pos, old_end_pos) old_pep = _translate_str(old_orf, self.table, gap="N") if "*" in old_pep and old_pep.find("*") < len(old_pep) - 1: - logger.error("Stop codon found within the ORF of %s (pos %s of %s; phase %s). This is invalid!", - self.id, old_pep.find("*"), len(old_pep), self.phase) + logger.error( + "Stop codon found within the ORF of %s (pos %s of %s; phase %s). This is invalid!", + self.id, + old_pep.find("*"), + len(old_pep), + self.phase, + ) self.start_codon = old_orf[:3] self.stop_codon = old_orf[-3:] - logger.debug("%s: start codon %s, old start %s (%s); stop codon %s, old stop %s (%s)", - self.name, self.start_codon, self.thick_start + self.phase, - (self.thick_start + self.phase + upstream), - self.stop_codon, self.thick_end, (self.thick_end + upstream)) + logger.debug( + "%s: start codon %s, old start %s (%s); stop codon %s, old stop %s (%s)", + self.name, + self.start_codon, + self.thick_start + self.phase, + (self.thick_start + self.phase + upstream), + self.stop_codon, + self.thick_end, + (self.thick_end + upstream), + ) # Now expand self.end = len(sequence) self.thick_start += upstream self.thick_end += upstream start_codon = str(self.start_codon).upper() stop_codon = str(self.stop_codon).upper() - self.has_start_codon = (start_codon in self.table.start_codons) - self.has_stop_codon = (stop_codon in self.table.stop_codons) - self.logger.debug("%s has start codon (%s): %s", self.chrom, start_codon, self.has_start_codon) - self.logger.debug("%s has stop codon (%s): %s", self.chrom, stop_codon, self.has_stop_codon) + self.has_start_codon = start_codon in self.table.start_codons + self.has_stop_codon = stop_codon in self.table.stop_codons + self.logger.debug( + "%s has start codon (%s): %s", self.chrom, start_codon, self.has_start_codon + ) + self.logger.debug( + "%s has stop codon (%s): %s", self.chrom, stop_codon, self.has_stop_codon + ) if expand_orf is True and not (self.has_start_codon and self.has_stop_codon): if not self.has_start_codon: - for pos in range(old_start_pos + upstream, - 0, - -3): - codon = sequence[pos:pos + 3].upper() + for pos in range(old_start_pos + upstream, 0, -3): + codon = sequence[pos : pos + 3].upper() self.thick_start = pos + 1 if codon in self.table.start_codons: # self.thick_start = pos self.start_codon = codon self.__has_start = True - logger.debug("Position %d, codon %s. Start codon found.", pos, codon) + logger.debug( + "Position %d, codon %s. Start codon found.", pos, codon + ) break if self.start_codon not in self.table.start_codons: self.phase = (self.thick_start - 1) % 3 - logger.debug("No start codon found for %s. Thick start %s, new phase: %s", - self.id, self.thick_start, self.phase) + logger.debug( + "No start codon found for %s. Thick start %s, new phase: %s", + self.id, + self.thick_start, + self.phase, + ) self.thick_start = 1 else: self.phase = 0 self.__has_start = True - coding_seq = sequence[self.thick_start + self.phase - 1:self.end] + coding_seq = sequence[self.thick_start + self.phase - 1 : self.end] if len(coding_seq) % 3 != 0: # Only get a multiple of three - coding_seq = coding_seq[:-((len(coding_seq)) % 3)] + coding_seq = coding_seq[: -((len(coding_seq)) % 3)] prot_seq = _translate_str(coding_seq, table=self.table, gap="N") if "*" in prot_seq: - self.thick_end = self.thick_start + self.phase - 1 + (1 + prot_seq.find("*")) * 3 - self.stop_codon = coding_seq[prot_seq.find("*") * 3:(1 + prot_seq.find("*")) * 3].upper() + self.thick_end = ( + self.thick_start + self.phase - 1 + (1 + prot_seq.find("*")) * 3 + ) + self.stop_codon = coding_seq[ + prot_seq.find("*") * 3 : (1 + prot_seq.find("*")) * 3 + ].upper() self.__has_stop = True logger.debug("New stop codon for %s: %s", self.name, self.thick_end) @@ -1426,7 +1686,6 @@ def expand(self, sequence, upstream, downstream, expand_orf=False, logger=create @property def blocks(self): - """This will return the coordinates of the blocks, with a 1-offset (as in GFF3)""" # First thing: calculate where each start point will be @@ -1436,9 +1695,15 @@ def blocks(self): return list(zip(_bstarts, _bends)) - def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=False, - lenient=False, alias=None, coding=True): - + def to_transcriptomic( + self, + sequence=None, + fasta_index=None, + start_adjustment=False, + lenient=False, + alias=None, + coding=True, + ): """This method will return a transcriptomic version of the BED12. If the object is already transcriptomic, it will return itself.""" @@ -1462,27 +1727,42 @@ def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=Fa # Check thick start and end are defined - assert tStart is not None and tEnd is not None, f"The thick start, thick end of {self.id} are invalid " \ - f"as they are outside of the defined exons.\nThick start: " \ - f"{self.thick_start}\nThick end: {self.thick_end}\n" \ - f"Exons: {self.blocks}" + assert tStart is not None and tEnd is not None, ( + f"The thick start, thick end of {self.id} are invalid " + f"as they are outside of the defined exons.\nThick start: " + f"{self.thick_start}\nThick end: {self.thick_end}\n" + f"Exons: {self.blocks}" + ) if self.strand == "+": bsizes = self.block_sizes[:] else: bsizes = np.flip(self.block_sizes) - tStart, tEnd = self.block_sizes.sum() - tEnd, self.block_sizes.sum() - tStart + tStart, tEnd = ( + self.block_sizes.sum() - tEnd, + self.block_sizes.sum() - tStart, + ) bstarts = np.concatenate([np.zeros(1, dtype=np.int64), bsizes[:-1].cumsum()]) if not (len(bstarts) == len(bsizes) == self.block_count): - raise ValueError("""In {self.id} ({self.chrom}:{self.start}-{self.end}) there is a discrepancy between block \ + raise ValueError( + """In {self.id} ({self.chrom}:{self.start}-{self.end}) there is a discrepancy between block \ starts (# {lbstarts}, {bstarts}), block sizes (# {lbsizes}, {bsizes}) and block counts (# {self.block_count}). \ -This is invalid""".format(self=self, lbstarts=len(bstarts), lbsizes=len(bsizes), bstarts=bstarts, bsizes=bsizes)) +This is invalid""".format( + self=self, + lbstarts=len(bstarts), + lbsizes=len(bsizes), + bstarts=bstarts, + bsizes=bsizes, + ) + ) if self.coding: - new_name = "ID={};coding={};phase={}".format(self.name.split(";")[0], - self.coding, - self.phase if self.phase is not None else 0) + new_name = "ID={};coding={};phase={}".format( + self.name.split(";")[0], + self.coding, + self.phase if self.phase is not None else 0, + ) else: new_name = "ID={};coding={}".format(self.name.split(";")[0], self.coding) @@ -1492,31 +1772,32 @@ def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=Fa if not self.coding: tStart, tEnd = 0, 1 - new = list((self.name.split(";")[0], - 0, - self.block_sizes.sum(), - new_name, - self.score, - "+")) - - new.extend(list(( - tStart, - tEnd, - self.rgb, - self.block_count, - bsizes, - bstarts - ))) - - new = BED12(new, - phase=self.phase, - sequence=sequence, - coding=self.coding, - fasta_index=fasta_index, - transcriptomic=True, - lenient=lenient, - start_adjustment=start_adjustment) - assert isinstance(new, type(self)), f"The new object is of type {type(new)} instead of {type(self)}!" + new = list( + ( + self.name.split(";")[0], + 0, + self.block_sizes.sum(), + new_name, + self.score, + "+", + ) + ) + + new.extend(list((tStart, tEnd, self.rgb, self.block_count, bsizes, bstarts))) + + new = BED12( + new, + phase=self.phase, + sequence=sequence, + coding=self.coding, + fasta_index=fasta_index, + transcriptomic=True, + lenient=lenient, + start_adjustment=start_adjustment, + ) + assert isinstance(new, type(self)), ( + f"The new object is of type {type(new)} instead of {type(self)}!" + ) return new @property @@ -1544,7 +1825,9 @@ def logger(self, logger): if not isinstance(logger, logging.Logger): raise TypeError( "Objects of type {tself} accept only logging.Logger instances as loggers, not {tlog}!".format( - tself=type(self), tlog=type(logger))) + tself=type(self), tlog=type(logger) + ) + ) self.__logger = logger self.__logger.propagate = False @@ -1556,15 +1839,18 @@ class Bed12Parser(Parser): __annot_type__ = "bed12" - def __init__(self, handle, - fasta_index=None, - transcriptomic=False, - max_regression=0, - start_adjustment=True, - is_gff=False, - coding=False, - logger=create_null_logger(), - table=0): + def __init__( + self, + handle, + fasta_index=None, + transcriptomic=False, + max_regression=0, + start_adjustment=True, + is_gff=False, + coding=False, + logger=create_null_logger(), + table=0, + ): """ Constructor method. :param handle: the input BED file. @@ -1591,7 +1877,7 @@ def __init__(self, handle, self.__closed = False self.header = False self.__table = table - self._is_bed12 = (not is_gff) + self._is_bed12 = not is_gff self.__line_counter = 0 @staticmethod @@ -1599,8 +1885,8 @@ def __set_fasta_index(fasta_index): if isinstance(fasta_index, dict): # check that this is a bona fide dictionary ... assert isinstance( - fasta_index[random.choice(fasta_index.keys())], - Bio.SeqRecord.SeqRecord) + fasta_index[random.choice(fasta_index.keys())], Bio.SeqRecord.SeqRecord + ) elif fasta_index is not None: if isinstance(fasta_index, (str, bytes)): if isinstance(fasta_index, bytes): @@ -1615,14 +1901,23 @@ def __iter__(self): return self def __next__(self, seq=None): - try: if self._is_bed12 is True: return self.bed_next() else: return self.gff_next() - except (ValueError, KeyError, TypeError, UnicodeError, AttributeError, AssertionError, InvalidParsingFormat) as exc: - raise InvalidParsingFormat(f"This is not a valid BED12 file! Exception: {exc}") + except ( + ValueError, + KeyError, + TypeError, + UnicodeError, + AttributeError, + AssertionError, + InvalidParsingFormat, + ) as exc: + raise InvalidParsingFormat( + f"This is not a valid BED12 file! Exception: {exc}" + ) def __getstate__(self): state = super().__getstate__() @@ -1649,17 +1944,29 @@ def bed_next(self): line = next(self._handle) self.__line_counter += 1 try: - bed12 = BED12(line, - fasta_index=self.fasta_index, - transcriptomic=self.transcriptomic, - max_regression=self._max_regression, - coding=self.coding, - table=self.__table, - logger=self.logger, - start_adjustment=self.start_adjustment) - except (ValueError, TypeError, CodonTable.TranslationError, KeyError, InvalidParsingFormat) as exc: + bed12 = BED12( + line, + fasta_index=self.fasta_index, + transcriptomic=self.transcriptomic, + max_regression=self._max_regression, + coding=self.coding, + table=self.__table, + logger=self.logger, + start_adjustment=self.start_adjustment, + ) + except ( + ValueError, + TypeError, + CodonTable.TranslationError, + KeyError, + InvalidParsingFormat, + ) as exc: error = "Invalid line for file {name}, line {counter}:\n{line}\nError: {exc}".format( - name=self.name, counter=self.__line_counter, line=line.rstrip(), exc=exc) + name=self.name, + counter=self.__line_counter, + line=line.rstrip(), + exc=exc, + ) raise InvalidParsingFormat(error) return bed12 @@ -1675,25 +1982,47 @@ def gff_next(self): self.__line_counter += 1 try: gff_line = GffLine(line) - except (ValueError, TypeError, CodonTable.TranslationError, KeyError, InvalidParsingFormat) as exc: + except ( + ValueError, + TypeError, + CodonTable.TranslationError, + KeyError, + InvalidParsingFormat, + ) as exc: error = "Invalid line for file {name}, line {counter}:\n{line}\nError: {exc}".format( - name=self.name, counter=self.__line_counter, line=line.rstrip(), exc=exc) + name=self.name, + counter=self.__line_counter, + line=line.rstrip(), + exc=exc, + ) raise InvalidParsingFormat(error) if gff_line.feature != "CDS": continue # Compatibility with BED12 try: - bed12 = BED12(gff_line, - fasta_index=self.fasta_index, - transcriptomic=self.transcriptomic, - max_regression=self._max_regression, - table=self.__table, - start_adjustment=self.start_adjustment, - logger=self.logger) - except (ValueError, TypeError, CodonTable.TranslationError, KeyError, InvalidParsingFormat) as exc: + bed12 = BED12( + gff_line, + fasta_index=self.fasta_index, + transcriptomic=self.transcriptomic, + max_regression=self._max_regression, + table=self.__table, + start_adjustment=self.start_adjustment, + logger=self.logger, + ) + except ( + ValueError, + TypeError, + CodonTable.TranslationError, + KeyError, + InvalidParsingFormat, + ) as exc: error = "Invalid line for file {name}, line {counter}:\n{line}\nError: {exc}".format( - name=self.name, counter=self.__line_counter, line=line.rstrip(), exc=exc) + name=self.name, + counter=self.__line_counter, + line=line.rstrip(), + exc=exc, + ) raise InvalidParsingFormat(error) # raise NotImplementedError("Still working on this!") return bed12 @@ -1719,7 +2048,10 @@ def _max_regression(self): def _max_regression(self, value): if not (isinstance(value, (int, float)) and 0 <= value <= 1): raise ValueError( - "Invalid value specified for _max_regression (must be between 0 and 1): {}".format(value)) + "Invalid value specified for _max_regression (must be between 0 and 1): {}".format( + value + ) + ) self.__max_regression = value @property @@ -1736,21 +2068,21 @@ def coding(self, coding): class Bed12ParseWrapper(mp.Process): - - def __init__(self, - identifier=None, - rec_queue=None, - return_queue=None, - log_queue=None, - level="DEBUG", - fasta_index=None, - transcriptomic=False, - max_regression=0, - is_gff=False, - coding=False, - start_adjustment=True, - table=0): - + def __init__( + self, + identifier=None, + rec_queue=None, + return_queue=None, + log_queue=None, + level="DEBUG", + fasta_index=None, + transcriptomic=False, + max_regression=0, + is_gff=False, + coding=False, + start_adjustment=True, + table=0, + ): """ :param send_queue: :type send_queue: mp.Queue @@ -1775,8 +2107,8 @@ def __init__(self, if isinstance(fasta_index, dict): # check that this is a bona fide dictionary ... assert isinstance( - fasta_index[random.choice(fasta_index.keys())], - Bio.SeqRecord.SeqRecord) + fasta_index[random.choice(fasta_index.keys())], Bio.SeqRecord.SeqRecord + ) elif fasta_index is not None: if isinstance(fasta_index, (str, bytes)): if isinstance(fasta_index, bytes): @@ -1790,7 +2122,7 @@ def __init__(self, self.__closed = False self.header = False self.__table = table - self._is_bed12 = (not is_gff) + self._is_bed12 = not is_gff def bed_next(self, line, sequence=None): """ @@ -1799,15 +2131,23 @@ def bed_next(self, line, sequence=None): """ try: - bed12 = BED12(line, - logger=self.logger, - sequence=sequence, - transcriptomic=self.transcriptomic, - max_regression=self._max_regression, - start_adjustment=self.start_adjustment, - coding=self.coding, - table=self.__table) - except (ValueError, TypeError, CodonTable.TranslationError, KeyError, InvalidParsingFormat) as exc: + bed12 = BED12( + line, + logger=self.logger, + sequence=sequence, + transcriptomic=self.transcriptomic, + max_regression=self._max_regression, + start_adjustment=self.start_adjustment, + coding=self.coding, + table=self.__table, + ) + except ( + ValueError, + TypeError, + CodonTable.TranslationError, + KeyError, + InvalidParsingFormat, + ) as exc: raise InvalidParsingFormat("Invalid line: {}".format(line)) return bed12 @@ -1819,20 +2159,28 @@ def gff_next(self, line, sequence): try: line = GffLine(line) - except (ValueError, TypeError, CodonTable.TranslationError, KeyError, InvalidParsingFormat) as exc: + except ( + ValueError, + TypeError, + CodonTable.TranslationError, + KeyError, + InvalidParsingFormat, + ) as exc: error = "Invalid line:\n{}".format(line) raise InvalidParsingFormat(error) if line.feature != "CDS": return None # Compatibility with BED12 - bed12 = BED12(line, - logger=self.logger, - sequence=sequence, - transcriptomic=self.transcriptomic, - max_regression=self._max_regression, - start_adjustment=self.start_adjustment, - table=self.__table) + bed12 = BED12( + line, + logger=self.logger, + sequence=sequence, + transcriptomic=self.transcriptomic, + max_regression=self._max_regression, + start_adjustment=self.start_adjustment, + table=self.__table, + ) # raise NotImplementedError("Still working on this!") return bed12 @@ -1868,13 +2216,19 @@ def run(self, *args, **kwargs): if not row or row.header is True: continue if row.invalid is True: - self.logger.warning("Invalid entry, reason: %s\n%s", - row.invalid_reason, - row) + self.logger.warning( + "Invalid entry, reason: %s\n%s", row.invalid_reason, row + ) continue - # self.cache[num] = + # self.cache[num] = self.return_queue.put((num, msgpack.dumps(row.as_simple_dict()))) except AttributeError: pass - except (ValueError, TypeError, CodonTable.TranslationError, KeyError, InvalidParsingFormat) as exc: + except ( + ValueError, + TypeError, + CodonTable.TranslationError, + KeyError, + InvalidParsingFormat, + ) as exc: raise InvalidParsingFormat(line) From a4191362f44cc70985cdececa46025b908788b6d Mon Sep 17 00:00:00 2001 From: Gemy Kaithakottil Date: Fri, 28 Nov 2025 14:50:55 +0000 Subject: [PATCH 02/10] fix: prevent treating NNN as a stop codon #469 --- Mikado/parsers/bed12.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index 7c95ffe4..64c30e21 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -790,7 +790,7 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): last_pos = -3 - ((len(orf_sequence)) % 3) translated_seq = _translate_str( - orf_sequence[:last_pos], table=self.table, gap="N" + orf_sequence[:last_pos], table=self.table ) self._internal_stop_codons = str(translated_seq).count("*") @@ -1598,7 +1598,7 @@ def expand( assert len(old_orf) > 0, (old_start_pos, old_end_pos) assert len(old_orf) % 3 == 0, (old_start_pos, old_end_pos) - old_pep = _translate_str(old_orf, self.table, gap="N") + old_pep = _translate_str(old_orf, self.table) if "*" in old_pep and old_pep.find("*") < len(old_pep) - 1: logger.error( "Stop codon found within the ORF of %s (pos %s of %s; phase %s). This is invalid!", @@ -1665,7 +1665,7 @@ def expand( if len(coding_seq) % 3 != 0: # Only get a multiple of three coding_seq = coding_seq[: -((len(coding_seq)) % 3)] - prot_seq = _translate_str(coding_seq, table=self.table, gap="N") + prot_seq = _translate_str(coding_seq, table=self.table) if "*" in prot_seq: self.thick_end = ( self.thick_start + self.phase - 1 + (1 + prot_seq.find("*")) * 3 From 4cb9770f3e8da8626b56960c692acf997dccb7ea Mon Sep 17 00:00:00 2001 From: Gemy Kaithakottil Date: Fri, 28 Nov 2025 14:52:11 +0000 Subject: [PATCH 03/10] fix: update gh actions --- .github/workflows/python-package.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index a06fb747..dc068fbf 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -23,14 +23,14 @@ jobs: sudo apt update sudo apt install -y build-essential zlib1g-dev zlib1g - uses: actions/checkout@v2 - - uses: actions/cache@v2 + - uses: actions/cache@v4 if: startsWith(runner.os, 'Linux') with: path: ~/.cache/pip key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements.txt') }} restore-keys: | ${{ runner.os }}-pip- - - uses: actions/cache@v2 + - uses: actions/cache@v4 if: startsWith(runner.os, 'macOS') with: path: ~/Library/Caches/pip @@ -39,7 +39,7 @@ jobs: ${{ runner.os }}-pip- - name: Cache conda id: cache-miniconda - uses: actions/cache@v2 + uses: actions/cache@v4 env: CACHE_NUMBER: 0 with: From 9a0fe2fd82c327bbb8ee0b9f0cb03848a15e8871 Mon Sep 17 00:00:00 2001 From: Gemy Kaithakottil Date: Mon, 1 Dec 2025 12:38:15 +0000 Subject: [PATCH 04/10] chore: format with ruff --- Mikado/tests/test_bed12.py | 80 ++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/Mikado/tests/test_bed12.py b/Mikado/tests/test_bed12.py index f507f46a..6c4d1d44 100644 --- a/Mikado/tests/test_bed12.py +++ b/Mikado/tests/test_bed12.py @@ -5,7 +5,6 @@ class TestTranslate(unittest.TestCase): - """ >>> >>> table = CodonTable.ambiguous_dna_by_id[1] @@ -29,7 +28,6 @@ class TestTranslate(unittest.TestCase): """ def test_from_Bio(self): - self.assertEqual(_translate_str("TAN", standard, pos_stop="X"), "X") for codon, amino in standard.forward_table.forward_table.items(): self.assertEqual(_translate_str(codon, standard), amino) @@ -51,12 +49,16 @@ def test_from_Bio(self): _translate_str("AAACCCTAG", standard, cds=True) with self.assertRaises(CodonTable.TranslationError) as exc: _translate_str("ATGCCCTAGCCCTAG", standard, cds=True) - self.assertTrue(str(exc.exception).startswith("Extra in-frame stop codon found."), - str(exc.exception)) + self.assertTrue( + str(exc.exception).startswith("Extra in-frame stop codon found."), + str(exc.exception), + ) with self.assertRaises(CodonTable.TranslationError) as exc: _translate_str("ATGCCCTAGCCCTAT", standard, cds=True) - self.assertTrue(str(exc.exception).startswith("Extra in-frame stop codon. Sequence:"), - str(exc.exception)) + self.assertTrue( + str(exc.exception).startswith("Extra in-frame stop codon. Sequence:"), + str(exc.exception), + ) for invalid in (10, "AB", b"NT"): with self.assertRaises(ValueError): _translate_str("ATGCCCTAG", standard, cds=True, gap=invalid) @@ -85,12 +87,20 @@ def test_ncbi_standard(self): self.assertEqual(_translate_str("ATGCCCTAG", standard, cds=True), "MP*") self.assertEqual(_translate_str("ATGCCCTAG", standard, to_stop=True), "MP") self.assertEqual(_translate_str("CTGCCCTAG", standard, cds=True), "MP*") - self.assertEqual(_translate_str("CTGCCCTAG", standard, cds=True, to_stop=True), "MP") + self.assertEqual( + _translate_str("CTGCCCTAG", standard, cds=True, to_stop=True), "MP" + ) def test_ambiguous(self): ambigouous = None for key, table in CodonTable.ambiguous_dna_by_id.items(): - amb = 0 < len([c for c in table._codon_table.forward_table.keys() if c in table.stop_codons]) + amb = 0 < len( + [ + c + for c in table._codon_table.forward_table.keys() + if c in table.stop_codons + ] + ) if amb: amb = table break @@ -120,15 +130,12 @@ def test_set_table(self): class Bed12GenToTrans(unittest.TestCase): - def setUp(self): pass def test_positive_mono_transfer(self): - string_bed = "1\t10\t500\ttest\t0\t+\t300\t390\t0\t1\t490\t0" - bed = BED12(string_bed) self.assertFalse(bed.invalid) self.assertFalse(bed.header) @@ -150,10 +157,9 @@ def test_positive_mono_transfer(self): self.assertEqual(tbed.thick_end, 380) self.assertTrue(tbed.has_start_codon) self.assertTrue(tbed.has_stop_codon) - self.assertEqual(seq[tbed.thick_start - 1:tbed.thick_end], "ATG" * 29 + "TGA") + self.assertEqual(seq[tbed.thick_start - 1 : tbed.thick_end], "ATG" * 29 + "TGA") def test_negative_mono_transfer(self): - string_bed = "1\t10\t500\ttest\t0\t-\t300\t390\t0\t1\t490\t0" bed = BED12(string_bed) @@ -179,10 +185,9 @@ def test_negative_mono_transfer(self): self.assertTrue(tbed.has_start_codon) self.assertTrue(tbed.has_stop_codon) - self.assertEqual(seq[tbed.thick_start - 1:tbed.thick_end], "ATG" * 29 + "TGA") + self.assertEqual(seq[tbed.thick_start - 1 : tbed.thick_end], "ATG" * 29 + "TGA") def test_diexonic_pos_transfer(self): - string_bed = "1\t10\t1000\ttest\t0\t+\t80\t920\t0\t2\t190,200\t0,790" bed = BED12(string_bed) self.assertFalse(bed.invalid or bed.header) @@ -206,10 +211,9 @@ def test_diexonic_pos_transfer(self): self.assertTrue(tbed.has_start_codon) self.assertTrue(tbed.has_stop_codon) - self.assertEqual(seq[tbed.thick_start - 1:tbed.thick_end], "ATG" * 79 + "TAA") + self.assertEqual(seq[tbed.thick_start - 1 : tbed.thick_end], "ATG" * 79 + "TAA") def test_diexonic_neg_transfer(self): - string_bed = "1\t10\t1000\ttest\t0\t-\t80\t920\t0\t2\t190,200\t0,790" bed = BED12(string_bed) self.assertFalse(bed.invalid or bed.header) @@ -233,10 +237,9 @@ def test_diexonic_neg_transfer(self): self.assertTrue(tbed.has_start_codon) self.assertTrue(tbed.has_stop_codon) - self.assertEqual(seq[tbed.thick_start - 1:tbed.thick_end], "ATG" * 79 + "TAA") + self.assertEqual(seq[tbed.thick_start - 1 : tbed.thick_end], "ATG" * 79 + "TAA") def test_wheat_1(self): - string_bed = "chr7A\t207087445\t207089574\tTraesCS7A01G235400.1\t0\t-\t207087615\t207088433\t0\t3\t457,393,30\t0,603,2099" string_seq = """CGCGTCGGTGCATCCGGATACGTCGCCTGGGCTACACAATGGCGCTGATCGATTGGATAG AACTGAGTGATGATGCAGAGATTATTGAATTGAGCAGTAGCGAGGAGAATGTCGAAGAAT @@ -272,10 +275,13 @@ def test_wheat_1(self): bed = BED12(string_bed) self.assertFalse(bed.invalid or bed.header) - self.assertEqual(bed.start, 207087445+1) + self.assertEqual(bed.start, 207087445 + 1) self.assertEqual(bed.end, 207089574) self.assertEqual(bed.strand, "-") - self.assertEqual(bed.blocks, [(207087446,207087902), (207088049,207088441), (207089545,207089574) ]) + self.assertEqual( + bed.blocks, + [(207087446, 207087902), (207088049, 207088441), (207089545, 207089574)], + ) self.assertEqual(bed.thick_start, 207087616) self.assertEqual(bed.thick_end, 207088433) @@ -284,11 +290,12 @@ def test_wheat_1(self): self.assertEqual(tbed.thick_end - tbed.thick_start + 1, 672) self.assertEqual(tbed.thick_start, string_seq.index("ATGGCGCTGATCGATTGGA") + 1) self.assertEqual(tbed.thick_start, 39) - self.assertEqual(tbed.thick_end, string_seq.index("CTCGGCAGATAG") + len("CTCGGCAGATAG")) - self.assertEqual(string_cds, string_seq[tbed.thick_start - 1:tbed.thick_end]) + self.assertEqual( + tbed.thick_end, string_seq.index("CTCGGCAGATAG") + len("CTCGGCAGATAG") + ) + self.assertEqual(string_cds, string_seq[tbed.thick_start - 1 : tbed.thick_end]) def test_mono_pos_bed_with_phase(self): - string = "1\t10\t101\tID=test;phase=2;coding=True\t0\t+\t10\t101\t0\t1\t91\t0" seq = "A" + "CGG" * 29 + "TAA" @@ -311,7 +318,6 @@ def test_mono_pos_bed_with_phase(self): self.assertTrue(tbed.transcriptomic) def test_mono_neg_bed_with_phase(self): - string = "1\t10\t101\tID=test;phase=2;coding=True\t0\t-\t10\t101\t0\t1\t91\t0" seq = "A" + "CGG" * 29 + "TAA" @@ -334,7 +340,6 @@ def test_mono_neg_bed_with_phase(self): self.assertTrue(tbed.transcriptomic) def test_diex_pos_bed_with_phase_one(self): - string = "1\t10\t111\tID=test;phase=1;coding=True\t0\t+\t10\t101\t0\t1\t101\t0" seq = "A" + "CGG" * 29 + "TAA" + "A" * 10 @@ -357,7 +362,6 @@ def test_diex_pos_bed_with_phase_one(self): self.assertTrue(tbed.has_stop_codon) def test_diex_neg_bed_with_phase_one(self): - string = "1\t10\t300\tID=test;phase=1;coding=True\t0\t-\t70\t300\t0\t2\t90,100\t0,190" seq = "A" + "CGG" * 42 + "TAA" + "A" * 60 @@ -380,7 +384,6 @@ def test_diex_neg_bed_with_phase_one(self): self.assertTrue(tbed.has_stop_codon) def test_diex_pos_bed_with_phase_two(self): - string = "1\t9\t111\tID=test;phase=2;coding=True\t0\t+\t9\t101\t0\t1\t102\t0" seq = "GA" + "CGG" * 29 + "TAA" + "A" * 10 @@ -403,7 +406,6 @@ def test_diex_pos_bed_with_phase_two(self): self.assertTrue(tbed.has_stop_codon) def test_diex_neg_bed_with_phase_two(self): - string = "1\t10\t301\tID=test;phase=2;coding=True\t0\t-\t70\t301\t0\t2\t90,101\t0,190" seq = "GA" + "CGG" * 42 + "TAA" + "A" * 60 @@ -426,7 +428,6 @@ def test_diex_neg_bed_with_phase_two(self): self.assertTrue(tbed.has_stop_codon) def test_tran_to_bed12_neg(self): - for end, phase in [(299, 0), (300, 1), (301, 2)]: with self.subTest(): t = Transcript() @@ -443,9 +444,20 @@ def test_tran_to_bed12_neg(self): self.assertFalse(r.invalid) def test_touching_exons(self): - - bed12line = ["chr1", 172601, 175626, "ID=foo.1", 100, "-", 172601, 175626, "0,0,0", 3, "199,1281,861,", - "0,199,2164"] + bed12line = [ + "chr1", + 172601, + 175626, + "ID=foo.1", + 100, + "-", + 172601, + 175626, + "0,0,0", + 3, + "199,1281,861,", + "0,199,2164", + ] bed = BED12(bed12line, transcriptomic=False) self.assertFalse(bed.invalid, bed.invalid_reason) t = Transcript(bed) @@ -455,4 +467,4 @@ def test_touching_exons(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From 199b690ce1f19d5002fae33500232fd6d5d548e7 Mon Sep 17 00:00:00 2001 From: Gemy Kaithakottil Date: Mon, 1 Dec 2025 12:42:24 +0000 Subject: [PATCH 05/10] feat: update unittest for the NNN change #469 --- Mikado/tests/test_bed12.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Mikado/tests/test_bed12.py b/Mikado/tests/test_bed12.py index 6c4d1d44..1bd32114 100644 --- a/Mikado/tests/test_bed12.py +++ b/Mikado/tests/test_bed12.py @@ -14,6 +14,10 @@ class TestTranslate(unittest.TestCase): '*' >>> _translate_str("TAN", table) 'X' + >>> _translate_str("NNN", table) + 'X' + >>> _translate_str("NNN", table, gap="N") + '*' >>> _translate_str("TAN", table, pos_stop="@") '@' >>> _translate_str("TA?", table) @@ -37,6 +41,8 @@ def test_from_Bio(self): self.assertEqual(_translate_str("TAN", standard, pos_stop="U"), "U") self.assertEqual(_translate_str("TAN", standard, pos_stop=b"U"), "U") + self.assertEqual(_translate_str("NNN", standard), "X") + self.assertEqual(_translate_str("NNN", standard, gap="N"), "*") with self.assertRaises(CodonTable.TranslationError): _translate_str("TA?", standard) From 0b0df26d564ef090efa84ba83bc5b5fd365806e9 Mon Sep 17 00:00:00 2001 From: Gemy Kaithakottil Date: Mon, 1 Dec 2025 13:02:54 +0000 Subject: [PATCH 06/10] fix: update gh actions --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index dc068fbf..13dce218 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -52,7 +52,7 @@ jobs: with: python-version: ${{ matrix.python-version }} miniforge-variant: Mambaforge - miniforge-version: 4.9.2-4 + miniforge-version: latest # mamba-version: "*" # channels: conda-forge, defaults channels: conda-forge, bioconda, defaults, anaconda From e0a12f37c54adeff7734ceddc36824d7b5de844a Mon Sep 17 00:00:00 2001 From: Gemy Kaithakottil Date: Wed, 3 Dec 2025 14:20:13 +0000 Subject: [PATCH 07/10] fix: fix error with libmamba Could not solve for environment specs, update to pysam==0.23.3 and diamond==2.1.16 and removed channels - anaconda, defaults --- environment.yml | 8 +++----- requirements.txt | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/environment.yml b/environment.yml index daed53ab..692109a5 100644 --- a/environment.yml +++ b/environment.yml @@ -1,9 +1,7 @@ name: mikado2 channels: - - bioconda - conda-forge - - defaults - - anaconda + - bioconda dependencies: - python>=3.9,<3.11 - cython==0.29.32 @@ -15,7 +13,7 @@ dependencies: - networkx==2.8.7 - numpy==1.23.3 - pandas==1.5.0 - - pysam==0.19.1 + - pysam==0.23.3 - pyyaml==6.0.1 - scipy==1.11.1 - snakemake==6.15.5 @@ -35,5 +33,5 @@ dependencies: - samtools>=1.11 - htslib>=1.11 - prodigal==2.6.3 - - diamond==2.0.11 + - diamond==2.1.16 - portcullis==1.2.4 diff --git a/requirements.txt b/requirements.txt index 563fffc1..dbde9262 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ msgpack==1.0.4 networkx==2.8.7 numpy==1.23.3 pandas==1.5.0 -pysam==0.19.1 +pysam==0.23.3 pyyaml==6.0.1 scipy==1.11.1 snakemake==6.15.5 From b4e64f1e696a07bf1f13d9b764f6c45e8ac0ba0b Mon Sep 17 00:00:00 2001 From: Gemy Kaithakottil Date: Wed, 3 Dec 2025 14:23:36 +0000 Subject: [PATCH 08/10] feat(ci): update gh actions codeql-analysis.yml --- .github/workflows/codeql-analysis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 46acc186..2527fcbf 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -39,7 +39,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@v1 + uses: github/codeql-action/init@v2 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -50,7 +50,7 @@ jobs: # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - name: Autobuild - uses: github/codeql-action/autobuild@v1 + uses: github/codeql-action/autobuild@v2 # â„šī¸ Command-line programs to run using the OS shell. # 📚 https://git.io/JvXDl @@ -64,4 +64,4 @@ jobs: # make release - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v1 + uses: github/codeql-action/analyze@v2 From 09ae6d2299f9f154df03efa103eebd983c3984de Mon Sep 17 00:00:00 2001 From: Gemy Kaithakottil Date: Wed, 3 Dec 2025 14:24:43 +0000 Subject: [PATCH 09/10] feat: pin versions in pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2fbd8447..119b5a3b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools","wheel","cython","numpy","scipy"] +requires = ["setuptools==65.4.1","wheel==0.37.1","Cython>=0.29.32","numpy==1.23.3","scipy==1.11.1"] build-backend = "setuptools.build_meta" [pytest] From 01ea8406a2cdefb295f134a03d1c401968431807 Mon Sep 17 00:00:00 2001 From: Gemy Kaithakottil Date: Wed, 3 Dec 2025 14:28:41 +0000 Subject: [PATCH 10/10] feat(ci): start using mamba-org/setup-micromamba instead of conda-incubator/setup-miniconda feat(ci): update channel_alias to Pixi: Package Management - https://repo.prefix.dev feat(ci): update actions to latest versions --- .github/workflows/python-package.yml | 49 ++++++++++++++-------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 13dce218..56143b19 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -11,8 +11,7 @@ jobs: shell: bash -el {0} strategy: matrix: - # python-version: [ "3.8", "3.9" ] - python-version: [ "3.9" ] + python-version: [ "3.9", "3.10" ] # os: [ubuntu-latest, macos-latest] os: [ubuntu-latest] steps: @@ -22,7 +21,11 @@ jobs: export DEBIAN_FRONTEND=noninteractive sudo apt update sudo apt install -y build-essential zlib1g-dev zlib1g - - uses: actions/checkout@v2 + - uses: actions/checkout@v5 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} - uses: actions/cache@v4 if: startsWith(runner.os, 'Linux') with: @@ -46,37 +49,33 @@ jobs: path: ~/conda_pkgs_dir key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('./environment.yml') }} - - uses: actions/checkout@v4 - - uses: conda-incubator/setup-miniconda@v2 - name: setup-Mambaforge + - uses: mamba-org/setup-micromamba@v2 + name: setup-Micromamba with: - python-version: ${{ matrix.python-version }} - miniforge-variant: Mambaforge - miniforge-version: latest - # mamba-version: "*" - # channels: conda-forge, defaults - channels: conda-forge, bioconda, defaults, anaconda - channel-priority: true - activate-environment: "mikado2" environment-file: ./environment.yml - use-mamba: true - # use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly! - - name: Verify conda environment + environment-name: mikado2 + cache-environment: true + condarc: | + channels: + - conda-forge + - bioconda + channel_priority: flexible + channel_alias: https://repo.prefix.dev + - name: Verify micromamba environment run: | - conda info --envs - conda env list - conda activate mikado2 - conda list + micromamba info + micromamba activate mikado2 + micromamba list - name: Install dependencies run: | - conda activate mikado2 + micromamba activate mikado2 python --version gcc --version pip --version + pip install numpy==1.23.3 cython==0.29.32 pytest-cov pip install -r requirements.txt python -c "import pysam; print(pysam.__version__)" - pip install Cython pytest-cov - python setup.py develop + pip install --no-deps --editable . - name: Test light run: | pytest -m slow Mikado/tests/test_light.py::LightTest::test_subprocess_multi_empty_orfs @@ -90,7 +89,7 @@ jobs: pytest -m 'not triage'; - name: Upload coverage to Codecov if: startsWith(runner.os, 'Linux') - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v3 # - name: Test daijin # if: startsWith(runner.os, 'Linux') # run: |