-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsequence_utils.py
More file actions
90 lines (70 loc) · 2.64 KB
/
sequence_utils.py
File metadata and controls
90 lines (70 loc) · 2.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python3
"""
sequence_utils.py — Shared DNA/FASTA sequence utilities for DGD
================================================================
Author : Vipin Menon, BIG Lab, Hanyang University
Date : August 2021 (modernized 2026)
Provides:
- reverse_complement() — reverse-complement a DNA string
- parse_fasta() — lazy FASTA file parser
"""
import logging
from typing import Iterator, Optional, Tuple
logger = logging.getLogger(__name__)
# Precomputed DNA complement translation table (built once at import time).
# Maps the four bases and the IUPAC ambiguity codes to their complements in
# both upper and lower case, so soft-masked (lowercase) or ambiguous input
# is complemented instead of being passed through unchanged.
_COMPLEMENT_TABLE = str.maketrans(
    "ACGTRYKMBDHVSWN" "acgtrykmbdhvswn",
    "TGCAYRMKVHDBSWN" "tgcayrmkvhdbswn",
)


def reverse_complement(sequence: str) -> str:
    """
    Return the reverse complement of a DNA sequence.

    The sequence is reversed and every base is replaced by its complement.
    The case of each base is preserved. Characters that are not DNA bases
    or IUPAC ambiguity codes (for example gap symbols) are left unchanged
    in their reversed position.

    Args:
        sequence: A DNA string. A, T, C, G and the IUPAC ambiguity codes
            (R, Y, K, M, B, D, H, V, S, W, N) are recognised in either case.

    Returns:
        The reverse complement, with the same length and letter case as
        the input. An empty input yields an empty string.

    Example:
        >>> reverse_complement("ATCG")
        'CGAT'
        >>> reverse_complement("aaNt")
        'aNtt'
    """
    return sequence[::-1].translate(_COMPLEMENT_TABLE)
def parse_fasta(filepath: str) -> Iterator[Tuple[str, str]]:
    """
    Parse a FASTA file and lazily yield (sequence_id, sequence) pairs.

    Multi-line sequences are concatenated and uppercased. Leading and
    trailing whitespace is removed from every line, and empty lines are
    ignored. The sequence ID is the first whitespace-delimited word after
    the ">" marker; the rest of the header line is discarded. Any text
    that appears before the first header line is ignored. A leading UTF-8
    byte-order mark, if present, is skipped.

    This is a generator: the file is not opened, and none of the errors
    below are raised, until the first item is requested.

    Args:
        filepath: Path to a FASTA-formatted file (.fa, .fasta, .fna).

    Yields:
        Tuples of (sequence_id, full_sequence).

    Raises:
        FileNotFoundError : If the path does not point to an existing file.
        ValueError        : If a header line has no ID after ">", or if the
                            file contains no FASTA records at all.

    Example:
        >>> for seq_id, sequence in parse_fasta("input.fa"):
        ...     print(seq_id, len(sequence))
    """
    import os

    if not os.path.isfile(filepath):
        raise FileNotFoundError(f"FASTA file not found: '{filepath}'")

    current_id: Optional[str] = None
    parts: list = []

    # "utf-8-sig" decodes plain UTF-8 unchanged but also drops a leading BOM;
    # with plain "utf-8" the BOM would hide the first ">" and lose that record.
    with open(filepath, "r", encoding="utf-8-sig") as fh:
        for line_number, raw_line in enumerate(fh, start=1):
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith(">"):
                # Flush the previous record before starting a new one.
                if current_id is not None:
                    yield current_id, "".join(parts).upper()

                # Take only the first word as the sequence ID. A bare ">"
                # has no words, so report it instead of failing on [0].
                header_words = line[1:].split()
                if not header_words:
                    raise ValueError(
                        f"Empty FASTA header at line {line_number} "
                        f"in '{filepath}'"
                    )
                current_id = header_words[0]
                parts = []
            elif current_id is not None:
                parts.append(line)
            # Otherwise: sequence text before the first header belongs to no
            # record and is skipped.

    # No header was ever seen, so there is nothing to flush.
    if current_id is None:
        raise ValueError(f"No valid FASTA records found in '{filepath}'")

    # Flush the final record (the file is already closed at this point).
    yield current_id, "".join(parts).upper()