Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
1a74ad2
new pluviometer working
eascarrunz Jul 8, 2025
c0d8196
script rewrite (pluvi.py) + refactoring of other files
eascarrunz Jul 8, 2025
4054787
fix progress bar option
eascarrunz Jul 8, 2025
f167d6f
replace old pluviometer
eascarrunz Jul 10, 2025
f858ec3
add shebang
eascarrunz Jul 10, 2025
4416cbe
display progress bar
eascarrunz Jul 10, 2025
b25547f
change permissions to make script executable
eascarrunz Jul 10, 2025
3ad7e59
print field with type of parent feature
eascarrunz Jul 10, 2025
185cf09
add logging for record switching
eascarrunz Jul 11, 2025
37e5bdb
set pluviometer error strategy to "terminate"
eascarrunz Jul 11, 2025
99c4d7e
fix logging that used an attribute that was not set yet
eascarrunz Jul 11, 2025
900d7df
set logging format, add logging points for GFF parsing
eascarrunz Jul 11, 2025
d16ad13
Aggregate level-1 features selecting only one representative transcri…
eascarrunz Jul 11, 2025
c8d606d
get last position from record's sequence length
eascarrunz Jul 11, 2025
d8c9109
fix last commit
eascarrunz Jul 11, 2025
1cde6b9
remove incorrect assertion in `SeqFeature.get_transcript_like`
eascarrunz Jul 11, 2025
dc0dce6
disable logging format
eascarrunz Jul 11, 2025
5671111
fix typo
eascarrunz Jul 11, 2025
b84380c
set logging file
eascarrunz Jul 11, 2025
e2ea2d1
fix logging format string
eascarrunz Jul 11, 2025
a6e5ad5
add more logging
eascarrunz Jul 11, 2025
cbc4a90
correct log messages
eascarrunz Jul 11, 2025
895ea87
complete level-1 aggregation
eascarrunz Jul 11, 2025
9272254
add missing comma in aggregate output field list
eascarrunz Jul 11, 2025
792dc3b
fix output paths
eascarrunz Jul 11, 2025
ea91811
method for flushing action queue
eascarrunz Jul 11, 2025
42a2b4b
make pluviometer.log the base log file name
eascarrunz Jul 11, 2025
e681a21
add log file to process outputs
eascarrunz Jul 11, 2025
6904251
Stop counting when svdata.seqid != self.record.id, and retain the svd…
eascarrunz Jul 11, 2025
d6272ca
fix log file extension (.tsv -> .log)
eascarrunz Jul 11, 2025
5f4c2fc
initialize svdata (as None)
eascarrunz Jul 11, 2025
f0b7b54
make readers seek the next entry matching the current record
eascarrunz Jul 15, 2025
41f6dfb
open input file for each record
eascarrunz Jul 15, 2025
d2f7bc1
log hits in seek_record methods
eascarrunz Jul 15, 2025
e8f66cc
more logging
eascarrunz Jul 15, 2025
3f07287
increase waiting time and memory for pluviometer.py
eascarrunz Jul 15, 2025
f5d6924
create chimaeric transcripts
eascarrunz Jul 16, 2025
d04f3a0
fix typing after name change
eascarrunz Jul 16, 2025
5285064
fix missing output field from method `write_row_without_data`
eascarrunz Jul 16, 2025
324a6b4
add parent path to output files
eascarrunz Jul 16, 2025
9e5dea8
remove input format comment from output
eascarrunz Jul 16, 2025
1a8449b
old code cleanup
eascarrunz Jul 16, 2025
02fecb7
add chromosome and total aggregation tallies
eascarrunz Jul 17, 2025
99123a4
Perish the aggregation "mode"
eascarrunz Jul 17, 2025
c3ff345
write chromosome totals
eascarrunz Jul 17, 2025
d91b2c6
use np.int64 to avoid overflows when computing totals
eascarrunz Jul 17, 2025
6e2778a
code cleanup
eascarrunz Jul 17, 2025
3814709
log only one message in the `make_chimaera` method
eascarrunz Jul 17, 2025
7392d7b
remove unnecessary log message
eascarrunz Jul 17, 2025
078319e
delete vestige of `mode` parameter
eascarrunz Jul 17, 2025
f3ed8f0
bug fix: representative transcript selection was checking for type "c…
eascarrunz Jul 17, 2025
c6acc9e
improve logging message
eascarrunz Jul 17, 2025
ffb6a04
parallelization
eascarrunz Jul 18, 2025
936deeb
set up threads parameter in nf module
eascarrunz Jul 18, 2025
c04a55f
update pluviometer conda env to add natsort
eascarrunz Jul 18, 2025
424090a
typo in cpus param
eascarrunz Jul 18, 2025
7def980
remove delete before create
eascarrunz Jul 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 193 additions & 0 deletions bin/FeatureOutputWriter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
from MultiCounter import MultiCounter
from Bio.SeqFeature import SeqFeature
from Bio.SeqFeature import ExactPosition
from collections import defaultdict
from typing import TextIO
from utils import BASE_TYPES, MATCH_MISMATCH_TYPES

FEATURE_OUTPUT_FIELDS = [
"SeqID",
"Parents",
"FeatureID",
"Type",
"Start",
"End",
"Strand",
"CoveredSites",
f"GenomeBases[{','.join(BASE_TYPES)}]",
f"SiteBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
f"ReadBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
]

FEATURE_METADATA_OUTPUT_FIELDS = [
"SeqID",
"ParentsIDs",
"FeatureID",
"Type",
"Start",
"End",
"Strand",
]

FEATURE_DATA_OUTPUT_FIELDS = [
"CoveredSites",
f"GenomeBases[{','.join(BASE_TYPES)}]",
f"SiteBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
f"ReadBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
]

AGGREGATE_METADATA_OUTPUT_FIELDS = [
"SeqID",
"ParentsIDs",
"FeatureID",
"ParentType",
"AggregateType",
]

AGGREGATE_DATA_OUTPUT_FIELDS = [
"CoveredSites",
f"GenomeBases[{','.join(BASE_TYPES)}]",
f"SiteBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
f"ReadBasePairs[{','.join(MATCH_MISMATCH_TYPES)}]",
]

STR_ZERO_BASE_FREQS = ",".join('0' for _ in range(len(BASE_TYPES)))
STR_ZERO_EDIT_FREQS = ",".join('0' for _ in range(len(MATCH_MISMATCH_TYPES)))


def make_parent_path(parent_list: list[str]) -> str:
"""
Create a path string from an ordered list of parent IDs.
The separator is a comma, chosen because it is one of the few invalid characters in tag=value entries of the attributes field in the GFF3 format.

Consult the GFF3 specification for details: https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
"""
return ','.join(parent_list)


class RainFileWriter:
def __init__(
self, handle: TextIO, metadata_fields: list[str], data_fields: list[str]
):
self.handle = handle
self.metadata_fields: list[str] = metadata_fields
self.n_metadata: int = len(self.metadata_fields)
self.data_fields: list[str] = data_fields
self.n_data: int = len(self.data_fields)

return None

def write_header(self) -> int:
b: int = self.handle.write("\t".join(self.metadata_fields))
b += self.handle.write("\t")
b += self.handle.write("\t".join(self.data_fields))
b += self.handle.write("\n")

return b

def write_comment(self, comment: str) -> int:
b = self.handle.write("# ")
b += self.handle.write(comment)
b += self.handle.write("\n")

return b

def write_metadata(self, *metadata_values) -> int:
b: int = 0
for val in metadata_values:
b += self.handle.write(val)
b += self.handle.write("\t")

return b

def write_data(self, *data_values) -> int:
b: int = 0
for val in data_values[:-1]:
b += self.handle.write(val)
b += self.handle.write("\t")

b += self.handle.write(data_values[-1])
b += self.handle.write("\n")

return b


class FeatureFileWriter(RainFileWriter):
def __init__(self, handle: TextIO):
super().__init__(
handle, FEATURE_METADATA_OUTPUT_FIELDS, FEATURE_DATA_OUTPUT_FIELDS
)

return None

def write_metadata(self, record_id: str, feature: SeqFeature) -> int:
return super().write_metadata(
record_id,
make_parent_path(feature.parent_list),
feature.id,
feature.type,
str(feature.location.parts[0].start + ExactPosition(1)),
str(feature.location.parts[-1].end),
str(feature.location.strand),
)

def write_row_with_data(
self, record_id: str, feature: SeqFeature, counter: MultiCounter
) -> int:
return self.write_metadata(record_id, feature) + self.write_data(
str(counter.genome_base_freqs.sum()),
",".join(map(str, counter.genome_base_freqs.flat)),
",".join(map(str, counter.edit_site_freqs.flat)),
",".join(map(str, counter.edit_read_freqs.flat)),
)

def write_row_without_data(self, record_id: str, feature: SeqFeature) -> int:
return self.write_metadata(record_id, feature) + self.write_data(
'0', STR_ZERO_BASE_FREQS, STR_ZERO_EDIT_FREQS, STR_ZERO_EDIT_FREQS
)

class AggregateFileWriter(RainFileWriter):
def __init__(self, handle: TextIO):
super().__init__(
handle, AGGREGATE_METADATA_OUTPUT_FIELDS, AGGREGATE_DATA_OUTPUT_FIELDS
)

return None

def write_metadata(self, seq_id: str, feature: SeqFeature, aggregate_type: str) -> int:
return super().write_metadata(seq_id, make_parent_path(feature.parent_list), feature.id, feature.type, aggregate_type)

def write_rows_with_feature_and_data(self, record_id: str, feature: SeqFeature, counter_dict: defaultdict[str,MultiCounter]) -> int:
b: int = 0

for aggregate_type, aggregate_counter in counter_dict.items():
b += self.write_metadata(record_id, feature, aggregate_type)
b += self.write_data(
str(aggregate_counter.genome_base_freqs.sum()),
",".join(map(str, aggregate_counter.genome_base_freqs.flat)),
",".join(map(str, aggregate_counter.edit_site_freqs.flat)),
",".join(map(str, aggregate_counter.edit_read_freqs.flat)),
)

return b

def write_rows_with_data(
self,
record_id: str,
parent_list: list[str],
feature_id: str,
feature_type: str,
counter_dict: defaultdict[str,MultiCounter]
) -> int:
b: int = 0

for aggregate_type, aggregate_counter in counter_dict.items():
b += super().write_metadata(record_id, make_parent_path(parent_list), feature_id, feature_type, aggregate_type)
b += self.write_data(
str(aggregate_counter.genome_base_freqs.sum()),
",".join(map(str, aggregate_counter.genome_base_freqs.flat)),
",".join(map(str, aggregate_counter.edit_site_freqs.flat)),
",".join(map(str, aggregate_counter.edit_read_freqs.flat)),
)

return b
67 changes: 67 additions & 0 deletions bin/MultiCounter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from utils import SiteVariantData
import numpy as np
from numpy.typing import NDArray
from SiteFilter import SiteFilter
from typing import TextIO

class MultiCounter:
"""Holds the counter data and logic for a feature, feature aggregate, or record"""

def __init__(self, site_filter: SiteFilter) -> None:
"""
Tallies of the numbers of reads per edit type
This is a numpy matrix where the rows represent the reference base and the columns the edited base
Rows and column indices correspond to bases in alphabetic order (ACGT)
Row-columns corresponding to the same base (e.g. (0,0) -> (A,A)) do not represent edits, and should remain 0
"""
self.edit_read_freqs: NDArray[np.int64] = np.zeros((5, 5), dtype=np.int64)
self.edit_site_freqs: NDArray[np.int64] = np.zeros((5, 5), dtype=np.int64)

self.genome_base_freqs: NDArray[np.int64] = np.zeros(5, dtype=np.int64)

self.filter = site_filter

return None

def update(self, variant_data: SiteVariantData) -> None:
"""Increment the counters from the data in a SiteVariantData object."""
i: int = variant_data.reference

self.edit_read_freqs[i, :] += variant_data.frequencies

self.filter.apply(variant_data)
self.edit_site_freqs[i, :] += self.filter.frequencies

self.genome_base_freqs[i] += 1

return None

def merge(self, other_counter: "MultiCounter") -> None:
"""
Add to this counter the values of another.
"""
self.edit_read_freqs[:] += other_counter.edit_read_freqs
self.edit_site_freqs[:] += other_counter.edit_site_freqs
self.genome_base_freqs[:] += other_counter.genome_base_freqs

return None

def report(self, output_handle: TextIO) -> int:
b = 0

# Write the number of covered sites
b += output_handle.write(str(self.genome_base_freqs.sum()))
b += output_handle.write("\t")

# Write the base frequencies in the genome
b += write_base_array(output_handle, self.genome_base_freqs)
b += output_handle.write("\t")

# Write edited sites
b += write_edit_array(output_handle, self.edit_site_freqs)
b += output_handle.write("\t")

# Write edit frequencies
b += write_edit_array(output_handle, self.edit_read_freqs)

return b
104 changes: 104 additions & 0 deletions bin/SeqFeature_extensions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Custom methods and attributes for Bio.SeqFeature
from Bio.SeqFeature import SeqFeature, SimpleLocation, CompoundLocation
from utils import location_union
from typing import Optional
import logging

logger = logging.getLogger(__name__)

setattr(SeqFeature, "level", 0)


def get_transcript_like(self: SeqFeature) -> list[tuple[str, str, int]]:
"""
Return a list with information about sub-features that are transcript-like (i.e. their contain children of type "exon" or "CDS").

List items are tuples that contain the ID of the transcript-like feature, the type of the transcript-like feature, and the total exon or CDS of the transcript-like feature.
"""
transcript_like_list: list[tuple[str, str, int]] = []
for transcript_candidate in self.sub_features:
total_exon_length: int = 0
total_cds_length: int = 0
for child in transcript_candidate.sub_features:
if child.type == "exon":
total_exon_length += len(child)
elif child.type == "CDS":
total_cds_length += len(child)

if total_cds_length > 0:
transcript_like_list.append(
(transcript_candidate.id, "CDS", total_cds_length)
)
elif total_exon_length > 0:
transcript_like_list.append(
(transcript_candidate.id, "exon", total_exon_length)
)

return transcript_like_list


setattr(SeqFeature, "get_transcript_like", get_transcript_like)

setattr(SeqFeature, "parent_list", [""])

def make_chimaera(self: SeqFeature) -> None:
"""
If the feature contains
"""
if hasattr(self, "sub_features"):
if len(self.sub_features) == 0:
return None
else:
return None

transcript_like_list: list[SeqFeature] = list(
filter(
lambda transcript: any(map(lambda part: part.type == "CDS", transcript.sub_features)),
self.sub_features,
)
)

if len(transcript_like_list) == 0:
chimaeric_type: str = "exon"
transcript_like_list: list[SeqFeature] = list(
filter(
lambda transcript: any(map(lambda part: part.type == "exon", transcript.sub_features)),
self.sub_features,
)
)
else:
chimaeric_type: str = "CDS"

if len(transcript_like_list) == 0:
return None


target_locations: list[SimpleLocation | CompoundLocation] = []
for transcript in transcript_like_list:
target_locations.extend(
list(map(
lambda part: part.location,
filter(lambda part: part.type == chimaeric_type, transcript.sub_features),
))
)

chimaeric_location: SimpleLocation | CompoundLocation = location_union(
target_locations
)
logging.info(f"Created {chimaeric_type} chimaera of feature {self.id}: {len(transcript_like_list)} transcripts were merged into one transcript of {len(chimaeric_location.parts)} elements")

chimaeric_feature: SeqFeature = SeqFeature(
location=chimaeric_location,
type=chimaeric_type + "-chimaera",
id=self.id + "-chimaera",
qualifiers={"Parent": self.id},
)

chimaeric_feature.sub_features = []

self.sub_features.append(chimaeric_feature)

return None


setattr(SeqFeature, "make_chimaera", make_chimaera)
21 changes: 21 additions & 0 deletions bin/SiteFilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from utils import SiteVariantData
import numpy as np
from numpy.typing import NDArray

class SiteFilter:
def __init__(self, cov_threshold: int, edit_threshold: int) -> None:
self.cov_threshold: int = cov_threshold
self.edit_threshold: int = edit_threshold
self.frequencies: NDArray[np.int32] = np.zeros(5, np.int32)

def apply(self, variant_data: SiteVariantData) -> None:
if variant_data.coverage >= self.cov_threshold:
np.copyto(
self.frequencies,
variant_data.frequencies * variant_data.frequencies
>= self.edit_threshold,
)
else:
self.frequencies.fill(0)

return None
Loading