Skip to content

Commit 47b6d77

Browse files
authored
Merge pull request #46 from Juke34/new-pluviometer
New pluviometer
2 parents da16547 + 7def980 commit 47b6d77

13 files changed

Lines changed: 1435 additions & 450 deletions

File tree

bin/FeatureOutputWriter.py

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
from MultiCounter import MultiCounter
2+
from Bio.SeqFeature import SeqFeature
3+
from Bio.SeqFeature import ExactPosition
4+
from collections import defaultdict
5+
from typing import TextIO
6+
from utils import BASE_TYPES, MATCH_MISMATCH_TYPES
7+
8+
FEATURE_OUTPUT_FIELDS = [
9+
"SeqID",
10+
"Parents",
11+
"FeatureID",
12+
"Type",
13+
"Start",
14+
"End",
15+
"Strand",
16+
"CoveredSites",
17+
f"GenomeBases[{','.join(BASE_TYPES)}]",
18+
f"SiteBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
19+
f"ReadBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
20+
]
21+
22+
FEATURE_METADATA_OUTPUT_FIELDS = [
23+
"SeqID",
24+
"ParentsIDs",
25+
"FeatureID",
26+
"Type",
27+
"Start",
28+
"End",
29+
"Strand",
30+
]
31+
32+
FEATURE_DATA_OUTPUT_FIELDS = [
33+
"CoveredSites",
34+
f"GenomeBases[{','.join(BASE_TYPES)}]",
35+
f"SiteBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
36+
f"ReadBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
37+
]
38+
39+
AGGREGATE_METADATA_OUTPUT_FIELDS = [
40+
"SeqID",
41+
"ParentsIDs",
42+
"FeatureID",
43+
"ParentType",
44+
"AggregateType",
45+
]
46+
47+
AGGREGATE_DATA_OUTPUT_FIELDS = [
48+
"CoveredSites",
49+
f"GenomeBases[{','.join(BASE_TYPES)}]",
50+
f"SiteBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
51+
f"ReadBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
52+
]
53+
54+
STR_ZERO_BASE_FREQS = ",".join('0' for _ in range(len(BASE_TYPES)))
55+
STR_ZERO_EDIT_FREQS = ",".join('0' for _ in range(len(MATCH_MISMATCH_TYPES)))
56+
57+
58+
def make_parent_path(parent_list: list[str]) -> str:
59+
"""
60+
Create a path string from an ordered list of parent IDs.
61+
The separator is a comma, chosen because it is one of the few invalid characters in tag=value entries of the attributes field in the GFF3 format.
62+
63+
Consult the GFF3 specification for details: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
64+
"""
65+
return ','.join(parent_list)
66+
67+
68+
class RainFileWriter:
69+
def __init__(
70+
self, handle: TextIO, metadata_fields: list[str], data_fields: list[str]
71+
):
72+
self.handle = handle
73+
self.metadata_fields: list[str] = metadata_fields
74+
self.n_metadata: int = len(self.metadata_fields)
75+
self.data_fields: list[str] = data_fields
76+
self.n_data: int = len(self.data_fields)
77+
78+
return None
79+
80+
def write_header(self) -> int:
81+
b: int = self.handle.write("\t".join(self.metadata_fields))
82+
b += self.handle.write("\t")
83+
b += self.handle.write("\t".join(self.data_fields))
84+
b += self.handle.write("\n")
85+
86+
return b
87+
88+
def write_comment(self, comment: str) -> int:
89+
b = self.handle.write("# ")
90+
b += self.handle.write(comment)
91+
b += self.handle.write("\n")
92+
93+
return b
94+
95+
def write_metadata(self, *metadata_values) -> int:
96+
b: int = 0
97+
for val in metadata_values:
98+
b += self.handle.write(val)
99+
b += self.handle.write("\t")
100+
101+
return b
102+
103+
def write_data(self, *data_values) -> int:
104+
b: int = 0
105+
for val in data_values[:-1]:
106+
b += self.handle.write(val)
107+
b += self.handle.write("\t")
108+
109+
b += self.handle.write(data_values[-1])
110+
b += self.handle.write("\n")
111+
112+
return b
113+
114+
115+
class FeatureFileWriter(RainFileWriter):
116+
def __init__(self, handle: TextIO):
117+
super().__init__(
118+
handle, FEATURE_METADATA_OUTPUT_FIELDS, FEATURE_DATA_OUTPUT_FIELDS
119+
)
120+
121+
return None
122+
123+
def write_metadata(self, record_id: str, feature: SeqFeature) -> int:
124+
return super().write_metadata(
125+
record_id,
126+
make_parent_path(feature.parent_list),
127+
feature.id,
128+
feature.type,
129+
str(feature.location.parts[0].start + ExactPosition(1)),
130+
str(feature.location.parts[-1].end),
131+
str(feature.location.strand),
132+
)
133+
134+
def write_row_with_data(
135+
self, record_id: str, feature: SeqFeature, counter: MultiCounter
136+
) -> int:
137+
return self.write_metadata(record_id, feature) + self.write_data(
138+
str(counter.genome_base_freqs.sum()),
139+
",".join(map(str, counter.genome_base_freqs.flat)),
140+
",".join(map(str, counter.edit_site_freqs.flat)),
141+
",".join(map(str, counter.edit_read_freqs.flat)),
142+
)
143+
144+
def write_row_without_data(self, record_id: str, feature: SeqFeature) -> int:
145+
return self.write_metadata(record_id, feature) + self.write_data(
146+
'0', STR_ZERO_BASE_FREQS, STR_ZERO_EDIT_FREQS, STR_ZERO_EDIT_FREQS
147+
)
148+
149+
class AggregateFileWriter(RainFileWriter):
150+
def __init__(self, handle: TextIO):
151+
super().__init__(
152+
handle, AGGREGATE_METADATA_OUTPUT_FIELDS, AGGREGATE_DATA_OUTPUT_FIELDS
153+
)
154+
155+
return None
156+
157+
def write_metadata(self, seq_id: str, feature: SeqFeature, aggregate_type: str) -> int:
158+
return super().write_metadata(seq_id, make_parent_path(feature.parent_list), feature.id, feature.type, aggregate_type)
159+
160+
def write_rows_with_feature_and_data(self, record_id: str, feature: SeqFeature, counter_dict: defaultdict[str,MultiCounter]) -> int:
161+
b: int = 0
162+
163+
for aggregate_type, aggregate_counter in counter_dict.items():
164+
b += self.write_metadata(record_id, feature, aggregate_type)
165+
b += self.write_data(
166+
str(aggregate_counter.genome_base_freqs.sum()),
167+
",".join(map(str, aggregate_counter.genome_base_freqs.flat)),
168+
",".join(map(str, aggregate_counter.edit_site_freqs.flat)),
169+
",".join(map(str, aggregate_counter.edit_read_freqs.flat)),
170+
)
171+
172+
return b
173+
174+
def write_rows_with_data(
175+
self,
176+
record_id: str,
177+
parent_list: list[str],
178+
feature_id: str,
179+
feature_type: str,
180+
counter_dict: defaultdict[str,MultiCounter]
181+
) -> int:
182+
b: int = 0
183+
184+
for aggregate_type, aggregate_counter in counter_dict.items():
185+
b += super().write_metadata(record_id, make_parent_path(parent_list), feature_id, feature_type, aggregate_type)
186+
b += self.write_data(
187+
str(aggregate_counter.genome_base_freqs.sum()),
188+
",".join(map(str, aggregate_counter.genome_base_freqs.flat)),
189+
",".join(map(str, aggregate_counter.edit_site_freqs.flat)),
190+
",".join(map(str, aggregate_counter.edit_read_freqs.flat)),
191+
)
192+
193+
return b

bin/MultiCounter.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from utils import SiteVariantData
2+
import numpy as np
3+
from numpy.typing import NDArray
4+
from SiteFilter import SiteFilter
5+
from typing import TextIO
6+
7+
class MultiCounter:
8+
"""Holds the counter data and logic for a feature, feature aggregate, or record"""
9+
10+
def __init__(self, site_filter: SiteFilter) -> None:
11+
"""
12+
Tallies of the numbers of reads per edit type
13+
This is a numpy matrix where the rows represent the reference base and the columns the edited base
14+
Rows and column indices correspond to bases in alphabetic order (ACGT)
15+
Row-columns corresponding to the same base (e.g. (0,0) -> (A,A)) do not represent edits, and should remain 0
16+
"""
17+
self.edit_read_freqs: NDArray[np.int64] = np.zeros((5, 5), dtype=np.int64)
18+
self.edit_site_freqs: NDArray[np.int64] = np.zeros((5, 5), dtype=np.int64)
19+
20+
self.genome_base_freqs: NDArray[np.int64] = np.zeros(5, dtype=np.int64)
21+
22+
self.filter = site_filter
23+
24+
return None
25+
26+
def update(self, variant_data: SiteVariantData) -> None:
27+
"""Increment the counters from the data in a SiteVariantData object."""
28+
i: int = variant_data.reference
29+
30+
self.edit_read_freqs[i, :] += variant_data.frequencies
31+
32+
self.filter.apply(variant_data)
33+
self.edit_site_freqs[i, :] += self.filter.frequencies
34+
35+
self.genome_base_freqs[i] += 1
36+
37+
return None
38+
39+
def merge(self, other_counter: "MultiCounter") -> None:
40+
"""
41+
Add to this counter the values of another.
42+
"""
43+
self.edit_read_freqs[:] += other_counter.edit_read_freqs
44+
self.edit_site_freqs[:] += other_counter.edit_site_freqs
45+
self.genome_base_freqs[:] += other_counter.genome_base_freqs
46+
47+
return None
48+
49+
def report(self, output_handle: TextIO) -> int:
50+
b = 0
51+
52+
# Write the number of covered sites
53+
b += output_handle.write(str(self.genome_base_freqs.sum()))
54+
b += output_handle.write("\t")
55+
56+
# Write the base frequencies in the genome
57+
b += write_base_array(output_handle, self.genome_base_freqs)
58+
b += output_handle.write("\t")
59+
60+
# Write edited sites
61+
b += write_edit_array(output_handle, self.edit_site_freqs)
62+
b += output_handle.write("\t")
63+
64+
# Write edit frequencies
65+
b += write_edit_array(output_handle, self.edit_read_freqs)
66+
67+
return b

bin/SeqFeature_extensions.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Custom methods and attributes for Bio.SeqFeature
2+
from Bio.SeqFeature import SeqFeature, SimpleLocation, CompoundLocation
3+
from utils import location_union
4+
from typing import Optional
5+
import logging
6+
7+
logger = logging.getLogger(__name__)
8+
9+
setattr(SeqFeature, "level", 0)
10+
11+
12+
def get_transcript_like(self: SeqFeature) -> list[tuple[str, str, int]]:
13+
"""
14+
Return a list with information about sub-features that are transcript-like (i.e. their contain children of type "exon" or "CDS").
15+
16+
List items are tuples that contain the ID of the transcript-like feature, the type of the transcript-like feature, and the total exon or CDS of the transcript-like feature.
17+
"""
18+
transcript_like_list: list[tuple[str, str, int]] = []
19+
for transcript_candidate in self.sub_features:
20+
total_exon_length: int = 0
21+
total_cds_length: int = 0
22+
for child in transcript_candidate.sub_features:
23+
if child.type == "exon":
24+
total_exon_length += len(child)
25+
elif child.type == "CDS":
26+
total_cds_length += len(child)
27+
28+
if total_cds_length > 0:
29+
transcript_like_list.append(
30+
(transcript_candidate.id, "CDS", total_cds_length)
31+
)
32+
elif total_exon_length > 0:
33+
transcript_like_list.append(
34+
(transcript_candidate.id, "exon", total_exon_length)
35+
)
36+
37+
return transcript_like_list
38+
39+
40+
setattr(SeqFeature, "get_transcript_like", get_transcript_like)
41+
42+
setattr(SeqFeature, "parent_list", [""])
43+
44+
def make_chimaera(self: SeqFeature) -> None:
45+
"""
46+
If the feature contains
47+
"""
48+
if hasattr(self, "sub_features"):
49+
if len(self.sub_features) == 0:
50+
return None
51+
else:
52+
return None
53+
54+
transcript_like_list: list[SeqFeature] = list(
55+
filter(
56+
lambda transcript: any(map(lambda part: part.type == "CDS", transcript.sub_features)),
57+
self.sub_features,
58+
)
59+
)
60+
61+
if len(transcript_like_list) == 0:
62+
chimaeric_type: str = "exon"
63+
transcript_like_list: list[SeqFeature] = list(
64+
filter(
65+
lambda transcript: any(map(lambda part: part.type == "exon", transcript.sub_features)),
66+
self.sub_features,
67+
)
68+
)
69+
else:
70+
chimaeric_type: str = "CDS"
71+
72+
if len(transcript_like_list) == 0:
73+
return None
74+
75+
76+
target_locations: list[SimpleLocation | CompoundLocation] = []
77+
for transcript in transcript_like_list:
78+
target_locations.extend(
79+
list(map(
80+
lambda part: part.location,
81+
filter(lambda part: part.type == chimaeric_type, transcript.sub_features),
82+
))
83+
)
84+
85+
chimaeric_location: SimpleLocation | CompoundLocation = location_union(
86+
target_locations
87+
)
88+
logging.info(f"Created {chimaeric_type} chimaera of feature {self.id}: {len(transcript_like_list)} transcripts were merged into one transcript of {len(chimaeric_location.parts)} elements")
89+
90+
chimaeric_feature: SeqFeature = SeqFeature(
91+
location=chimaeric_location,
92+
type=chimaeric_type + "-chimaera",
93+
id=self.id + "-chimaera",
94+
qualifiers={"Parent": self.id},
95+
)
96+
97+
chimaeric_feature.sub_features = []
98+
99+
self.sub_features.append(chimaeric_feature)
100+
101+
return None
102+
103+
104+
setattr(SeqFeature, "make_chimaera", make_chimaera)

bin/SiteFilter.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from utils import SiteVariantData
2+
import numpy as np
3+
from numpy.typing import NDArray
4+
5+
class SiteFilter:
6+
def __init__(self, cov_threshold: int, edit_threshold: int) -> None:
7+
self.cov_threshold: int = cov_threshold
8+
self.edit_threshold: int = edit_threshold
9+
self.frequencies: NDArray[np.int32] = np.zeros(5, np.int32)
10+
11+
def apply(self, variant_data: SiteVariantData) -> None:
12+
if variant_data.coverage >= self.cov_threshold:
13+
np.copyto(
14+
self.frequencies,
15+
variant_data.frequencies * variant_data.frequencies
16+
>= self.edit_threshold,
17+
)
18+
else:
19+
self.frequencies.fill(0)
20+
21+
return None

0 commit comments

Comments
 (0)