DGD-Cas9/sequence_utils.py at main · GuideDesigner/DGD-Cas9 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python3
"""
sequence_utils.py — Shared DNA/FASTA sequence utilities for DGD
================================================================
Author : Vipin Menon, BIG Lab, Hanyang University
Date   : August 2021 (modernized 2026)

Provides:
  - reverse_complement()  — reverse-complement a DNA string
  - parse_fasta()         — lazy FASTA file parser
"""

import logging
from typing import Iterator, Optional, Tuple

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

# Precomputed DNA complement translation table (built once at import time).
# Besides the four standard bases it covers the IUPAC ambiguity codes so that
# degenerate positions are complemented too (R<->Y, K<->M, B<->V, D<->H);
# S, W and N are their own complements. Characters not listed here are left
# unchanged by str.translate().
_COMPLEMENT_TABLE = str.maketrans(
    "ATCGRYKMBVDHSWN",
    "TAGCYRMKVBHDSWN",
)


def reverse_complement(sequence: str) -> str:
    """
    Return the reverse complement of a DNA sequence.

    The input is uppercased before translation, so lowercase (e.g.
    soft-masked) bases are complemented as well instead of being passed
    through untouched.

    Args:
        sequence: A DNA string. A, T, C, G and the IUPAC ambiguity codes
            (R, Y, K, M, B, V, D, H, S, W, N) are complemented, in either
            case. Any other character is kept as-is (uppercased).

    Returns:
        The reverse complement as an uppercase string. An empty input
        yields an empty string.

    Example:
        >>> reverse_complement("ATCG")
        'CGAT'
        >>> reverse_complement("atcg")
        'CGAT'
    """
    # Uppercase first: the translation table only contains uppercase keys.
    return sequence.upper()[::-1].translate(_COMPLEMENT_TABLE)


def parse_fasta(filepath: str) -> Iterator[Tuple[str, str]]:
    """
    Parse a FASTA file and yield (sequence_id, sequence) pairs.

    Multi-line sequences are concatenated. Each line has its leading and
    trailing whitespace stripped, and the joined sequence is uppercased.
    Empty lines are ignored. Only the first whitespace-delimited word of a
    header line is used as the sequence ID. A leading UTF-8 byte-order mark
    is tolerated. Any sequence text that appears before the first header
    belongs to no record and is skipped.

    This is a generator: the file is opened, and the exceptions below are
    raised, only once iteration starts, not when the function is called.

    Args:
        filepath: Path to a FASTA-formatted file (.fa, .fasta, .fna).

    Yields:
        Tuples of (sequence_id, full_sequence).

    Raises:
        FileNotFoundError : If the path is not an existing regular file.
        ValueError        : If a header line has no ID after '>', or if the
                            file contains no valid FASTA records.

    Example:
        >>> for seq_id, sequence in parse_fasta("input.fa"):
        ...     print(seq_id, len(sequence))
    """
    import os
    if not os.path.isfile(filepath):
        raise FileNotFoundError(f"FASTA file not found: '{filepath}'")

    current_id: Optional[str] = None
    parts: list = []
    found_any = False

    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM.
    # With plain "utf-8" the first line would read "\ufeff>id", fail the
    # startswith(">") test, and the first record would be silently lost.
    with open(filepath, "r", encoding="utf-8-sig") as fh:
        for line_number, raw_line in enumerate(fh, start=1):
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                # Flush previous record before starting a new one
                if current_id is not None:
                    yield current_id, "".join(parts).upper()
                    found_any = True
                # Take only the first word as the sequence ID
                header_fields = line[1:].split()
                if not header_fields:
                    # A bare ">" has no ID; report it clearly instead of
                    # failing with an IndexError on the empty split result.
                    raise ValueError(
                        f"Empty FASTA header at line {line_number} "
                        f"in '{filepath}'"
                    )
                current_id = header_fields[0]
                parts = []
            elif current_id is not None:
                parts.append(line)
            # else: sequence text before the first header has no record to
            # attach to, so it is skipped rather than buffered.

    # Flush the final record
    if current_id is not None:
        yield current_id, "".join(parts).upper()
        found_any = True

    if not found_any:
        raise ValueError(f"No valid FASTA records found in '{filepath}'")