From 1af7809f2c898bba9730b4bed9fb17bb450a6c7e Mon Sep 17 00:00:00 2001 From: earx Date: Thu, 19 Jun 2025 17:13:54 +0200 Subject: [PATCH 1/4] fix new matrices --- bin/stats/pluviometer.py | 23 +++++++++++++++++------ bin/stats/site_variant_readers.py | 4 ++-- bin/stats/utils.py | 12 ++++++++++++ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/bin/stats/pluviometer.py b/bin/stats/pluviometer.py index dbb15e9..5719a14 100644 --- a/bin/stats/pluviometer.py +++ b/bin/stats/pluviometer.py @@ -88,10 +88,13 @@ def parse_cli_input() -> argparse.Namespace: def write_output_file_header(handle: TextIO) -> int: return handle.write( "SeqID\tGrandParentID\tParentID\tFeatureID\tType\tStart\tEnd\tStrand\tCoveredSites" - + "\tSiteFreqs[" + + "\tGenomeBases[" + + ",".join(BASE_TYPES) + + "]" + + "\tSiteBasePairs[" + ",".join(MATCH_MISMATCH_TYPES) + "]" - + "\tReadFreqs[" + + "\tReadBasePairs[" + ",".join(MATCH_MISMATCH_TYPES) + "]" + "\n" @@ -102,7 +105,7 @@ class SiteFilter: def __init__(self, cov_threshold: int, edit_threshold: int) -> None: self.cov_threshold: int = cov_threshold self.edit_threshold: int = edit_threshold - self.frequencies: NDArray[np.int32] = np.zeros(4, np.int32) + self.frequencies: NDArray[np.int32] = np.zeros(5, np.int32) def apply(self, variant_data: SiteVariantData) -> None: if variant_data.coverage >= self.cov_threshold: @@ -127,9 +130,11 @@ def __init__(self, site_filter: SiteFilter) -> None: Rows and column indices correspond to bases in alphabetic order (ACGT) Row-columns corresponding to the same base (e.g. (0,0) -> (A,A)) do not represent edits, and should remain 0 """ - self.edit_read_freqs: NDArray[np.int32] = np.zeros((4, 4), dtype=np.int32) + self.edit_read_freqs: NDArray[np.int32] = np.zeros((5, 5), dtype=np.int32) + + self.edit_site_freqs: NDArray[np.int32] = np.zeros((5, 5), dtype=np.int32) - self.edit_site_freqs: NDArray[np.int32] = np.zeros((4, 4), dtype=np.int32) + self.genome_base_freqs: NDArray[np.int32] = np.zeros(5, dtype=np.int32) self.filter = site_filter @@ -144,13 +149,19 @@ def update(self, variant_data: SiteVariantData) -> None: self.filter.apply(variant_data) self.edit_site_freqs[i, :] += self.filter.frequencies + self.genome_base_freqs[i] += 1 + return None def report(self, output_handle: TextIO) -> int: b = 0 # Write the number of covered sites - b += output_handle.write(str(self.edit_site_freqs.sum())) + b += output_handle.write(str(self.genome_base_freqs.sum())) + b += output_handle.write("\t") + + # Write the base frequencies in the genome + b += write_base_array(output_handle, self.genome_base_freqs) b += output_handle.write("\t") # Write edited sites diff --git a/bin/stats/site_variant_readers.py b/bin/stats/site_variant_readers.py index 1a4f376..e0ba5ca 100644 --- a/bin/stats/site_variant_readers.py +++ b/bin/stats/site_variant_readers.py @@ -35,7 +35,7 @@ def __init__(self, strand: int, edit: str) -> None: assert edit in EDIT_TYPES + NONEDIT_TYPES self.reference: int = NUC_STR_TO_IND[edit[0]] self.edited: str = edit[1] - self.frequencies: NDArray[np.int32] = np.zeros(4, dtype=np.int32) + self.frequencies: NDArray[np.int32] = np.zeros(5, dtype=np.int32) self.frequencies[NUC_STR_TO_IND[self.edited]] = 1 return None @@ -128,7 +128,7 @@ def _parse_parts(self) -> SiteVariantData: strand=strand, coverage=int(self.parts[REDITOOLS_FIELD_INDEX["Coverage"]]), mean_quality=float(self.parts[REDITOOLS_FIELD_INDEX["MeanQ"]]), - frequencies=np.int32(self.parts[REDITOOLS_FIELD_INDEX["Frequencies"]][1:-1].split(",")) + frequencies=np.int32(self.parts[REDITOOLS_FIELD_INDEX["Frequencies"]][1:-1].split(",") + [0]) ) def read(self) -> Optional[SiteVariantData]: diff --git a/bin/stats/utils.py b/bin/stats/utils.py index d76bc47..f4e824f 100644 --- a/bin/stats/utils.py +++ b/bin/stats/utils.py @@ -2,6 +2,7 @@ from numpy.typing import NDArray from dataclasses import dataclass import itertools +from itertools import permutations BASE_TYPES = ["A", "C", "G", "T"] @@ -37,6 +38,17 @@ def argmax(collection) -> int: """ return max(range(len(collection)), key=lambda i: collection[i]) +nucs = ('A', 'C', 'G', 'T') + +def make_variant_header_dict(): + nuc_pairs = permutations(nucs, 2) + keys = ((x, NUC_STR_TO_IND[y]) for x, y in nuc_pairs) + return {k:v for v, k in enumerate(keys)} + +header_dict = make_variant_header_dict() + +for item in header_dict.items(): + print(item) @dataclass(frozen=True, slots=True) class SiteVariantData: From b642fd985379a8cb619ce75342fd77dde11de51d Mon Sep 17 00:00:00 2001 From: earx Date: Thu, 19 Jun 2025 17:14:13 +0200 Subject: [PATCH 2/4] add new result directory to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index feeded4..42ce46c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ result/* **/__pycache__/ .DS_Store sif_images/* +rain_result/* From 39400173cb6235404e08aebe00855e3c67489a41 Mon Sep 17 00:00:00 2001 From: earx Date: Thu, 19 Jun 2025 17:52:39 +0200 Subject: [PATCH 3/4] fix strandedness check --- modules/reditools2.nf | 9 +++++---- modules/reditools3.nf | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/modules/reditools2.nf b/modules/reditools2.nf index 2ed7f13..6e45976 100644 --- a/modules/reditools2.nf +++ b/modules/reditools2.nf @@ -15,19 +15,20 @@ process reditools2 { // Set the strand orientation parameter from the library type parameter // Terms explained in https://salmon.readthedocs.io/en/latest/library_type.html - if (meta.libtype in ["ISR", "SR"]) { + if (meta.strandedness in ["ISR", "SR"]) { // First-strand oriented strand_orientation = "2" - } else if (meta.libtype in ["ISF", "SF"]) { + } else if (meta.strandedness in ["ISF", "SF"]) { // Second-strand oriented strand_orientation = "1" - } else if (meta.libtype in ["IU", "U"]) { + } else if (meta.strandedness in ["IU", "U"]) { // Unstranded strand_orientation = "0" } else { // Unsupported: Pass the library type string so that it's reported in // the reditools error message - strand_orientation = meta.libtype + print("invalid strand \"${meta.strandedness}\"") + strand_orientation = meta.strandedness } base_name = bam.BaseName diff --git a/modules/reditools3.nf b/modules/reditools3.nf index f833047..cf27868 100644 --- a/modules/reditools3.nf +++ b/modules/reditools3.nf @@ -26,7 +26,8 @@ process reditools3 { } else { // Unsupported: Pass the library type string so that it's reported in // the reditools error message - strand_orientation = "0" + print("invalid strand ${meta.strandedness}") + strand_orientation = meta.strandedness } base_name = bam.BaseName From 86a26573602e9af0c88ffe7e66753837fc58c3e9 Mon Sep 17 00:00:00 2001 From: earx Date: Thu, 19 Jun 2025 17:55:40 +0200 Subject: [PATCH 4/4] conditionally activate fastqc_clip --- rain.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rain.nf b/rain.nf index 9366569..278b928 100644 --- a/rain.nf +++ b/rain.nf @@ -574,12 +574,12 @@ workflow { if (params.clipoverlap) { bamutil_clipoverlap(gatk_markduplicates.out.tuple_sample_dedupbam) tuple_sample_bam_processed = bamutil_clipoverlap.out.tuple_sample_clipoverbam + // stat on bam with overlap clipped + fastqc_clip(tuple_sample_bam_processed, "clip") + logs.concat(fastqc_clip.out).set{logs} // save log } else { tuple_sample_bam_processed = gatk_markduplicates.out.tuple_sample_dedupbam } - // stat on bam with overlap clipped - fastqc_clip(tuple_sample_bam_processed, "clip") - logs.concat(fastqc_clip.out).set{logs} // save log // index bam samtools_index(tuple_sample_bam_processed) // report with multiqc