Juke34 · eascarrunz · Jun 19, 2025 · Jun 19, 2025 · Jun 19, 2025 · Jun 19, 2025
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,4 @@ result/*
 **/__pycache__/
 .DS_Store
 sif_images/*
+rain_result/*
diff --git a/bin/stats/pluviometer.py b/bin/stats/pluviometer.py
@@ -88,10 +88,13 @@ def parse_cli_input() -> argparse.Namespace:
 def write_output_file_header(handle: TextIO) -> int:
     return handle.write(
         "SeqID\tGrandParentID\tParentID\tFeatureID\tType\tStart\tEnd\tStrand\tCoveredSites"
-        + "\tSiteFreqs["
+        + "\tGenomeBases["
+        + ",".join(BASE_TYPES)
+        + "]"
+        + "\tSiteBasePairs["
         + ",".join(MATCH_MISMATCH_TYPES)
         + "]"
-        + "\tReadFreqs["
+        + "\tReadBasePairs["
         + ",".join(MATCH_MISMATCH_TYPES)
         + "]"
         + "\n"
@@ -102,7 +105,7 @@ class SiteFilter:
     def __init__(self, cov_threshold: int, edit_threshold: int) -> None:
         self.cov_threshold: int = cov_threshold
         self.edit_threshold: int = edit_threshold
-        self.frequencies: NDArray[np.int32] = np.zeros(4, np.int32)
+        self.frequencies: NDArray[np.int32] = np.zeros(5, np.int32)
 
     def apply(self, variant_data: SiteVariantData) -> None:
         if variant_data.coverage >= self.cov_threshold:
@@ -127,9 +130,11 @@ def __init__(self, site_filter: SiteFilter) -> None:
         Rows and column indices correspond to bases in alphabetic order (ACGT)
         Row-columns corresponding to the same base (e.g. (0,0) -> (A,A)) do not represent edits, and should remain 0
         """
-        self.edit_read_freqs: NDArray[np.int32] = np.zeros((4, 4), dtype=np.int32)
+        self.edit_read_freqs: NDArray[np.int32] = np.zeros((5, 5), dtype=np.int32)
+
+        self.edit_site_freqs: NDArray[np.int32] = np.zeros((5, 5), dtype=np.int32)
 
-        self.edit_site_freqs: NDArray[np.int32] = np.zeros((4, 4), dtype=np.int32)
+        self.genome_base_freqs: NDArray[np.int32] = np.zeros(5, dtype=np.int32)
 
         self.filter = site_filter
 
@@ -144,13 +149,19 @@ def update(self, variant_data: SiteVariantData) -> None:
         self.filter.apply(variant_data)
         self.edit_site_freqs[i, :] += self.filter.frequencies
 
+        self.genome_base_freqs[i] += 1
+
         return None
 
     def report(self, output_handle: TextIO) -> int:
         b = 0
 
         # Write the number of covered sites
-        b += output_handle.write(str(self.edit_site_freqs.sum()))
+        b += output_handle.write(str(self.genome_base_freqs.sum()))
+        b += output_handle.write("\t")
+
+        # Write the base frequencies in the genome
+        b += write_base_array(output_handle, self.genome_base_freqs)
         b += output_handle.write("\t")
 
         # Write edited sites

diff --git a/bin/stats/site_variant_readers.py b/bin/stats/site_variant_readers.py
@@ -35,7 +35,7 @@ def __init__(self, strand: int, edit: str) -> None:
         assert edit in EDIT_TYPES + NONEDIT_TYPES
         self.reference: int = NUC_STR_TO_IND[edit[0]]
         self.edited: str = edit[1]
-        self.frequencies: NDArray[np.int32] = np.zeros(4, dtype=np.int32)
+        self.frequencies: NDArray[np.int32] = np.zeros(5, dtype=np.int32)
         self.frequencies[NUC_STR_TO_IND[self.edited]] = 1
 
         return None
@@ -128,7 +128,7 @@ def _parse_parts(self) -> SiteVariantData:
             strand=strand,
             coverage=int(self.parts[REDITOOLS_FIELD_INDEX["Coverage"]]),
             mean_quality=float(self.parts[REDITOOLS_FIELD_INDEX["MeanQ"]]),
-            frequencies=np.int32(self.parts[REDITOOLS_FIELD_INDEX["Frequencies"]][1:-1].split(","))
+            frequencies=np.int32(self.parts[REDITOOLS_FIELD_INDEX["Frequencies"]][1:-1].split(",") + [0])
         )
 
     def read(self) -> Optional[SiteVariantData]:

diff --git a/bin/stats/utils.py b/bin/stats/utils.py
@@ -2,6 +2,7 @@
 from numpy.typing import NDArray
 from dataclasses import dataclass
 import itertools
+from itertools import permutations
 
 BASE_TYPES = ["A", "C", "G", "T"]
 
@@ -37,6 +38,17 @@ def argmax(collection) -> int:
     """
     return max(range(len(collection)), key=lambda i: collection[i])
 
+nucs = ('A', 'C', 'G', 'T')
+
+def make_variant_header_dict():
+    nuc_pairs = permutations(nucs, 2)
+    keys = ((x, NUC_STR_TO_IND[y]) for x, y in nuc_pairs)
+    return {k:v for v, k in enumerate(keys)}
+
+header_dict = make_variant_header_dict()
+
+for item in header_dict.items():
+    print(item)
 
 @dataclass(frozen=True, slots=True)
 class SiteVariantData:

diff --git a/modules/reditools2.nf b/modules/reditools2.nf
@@ -15,19 +15,20 @@ process reditools2 {
 
     // Set the strand orientation parameter from the library type parameter
     // Terms explained in https://salmon.readthedocs.io/en/latest/library_type.html
-    if (meta.libtype in ["ISR", "SR"]) {
+    if (meta.strandedness in ["ISR", "SR"]) {
         // First-strand oriented
         strand_orientation = "2"
-    } else if (meta.libtype in ["ISF", "SF"]) {
+    } else if (meta.strandedness in ["ISF", "SF"]) {
         // Second-strand oriented
         strand_orientation = "1"
-    } else if (meta.libtype in ["IU", "U"]) {
+    } else if (meta.strandedness in ["IU", "U"]) {
         // Unstranded
         strand_orientation = "0"
     } else {
         // Unsupported: Pass the library type string so that it's reported in
         // the reditools error message
-        strand_orientation = meta.libtype
+        print("invalid strand \"${meta.strandedness}\"")
+        strand_orientation = meta.strandedness
     }
     base_name = bam.BaseName
 

diff --git a/modules/reditools3.nf b/modules/reditools3.nf
@@ -26,7 +26,8 @@ process reditools3 {
         } else {
             // Unsupported: Pass the library type string so that it's reported in
             // the reditools error message
-            strand_orientation = "0"
+            print("invalid strand ${meta.strandedness}")
+            strand_orientation = meta.strandedness
         }
         base_name = bam.BaseName
 

diff --git a/rain.nf b/rain.nf
@@ -574,12 +574,12 @@ workflow {
         if (params.clipoverlap) {
             bamutil_clipoverlap(gatk_markduplicates.out.tuple_sample_dedupbam)
             tuple_sample_bam_processed = bamutil_clipoverlap.out.tuple_sample_clipoverbam
+            // stat on bam with overlap clipped
+            fastqc_clip(tuple_sample_bam_processed, "clip")
+            logs.concat(fastqc_clip.out).set{logs} // save log
         } else {
             tuple_sample_bam_processed = gatk_markduplicates.out.tuple_sample_dedupbam
         }
-        // stat on bam with overlap clipped
-        fastqc_clip(tuple_sample_bam_processed, "clip")
-        logs.concat(fastqc_clip.out).set{logs} // save log
         // index bam
         samtools_index(tuple_sample_bam_processed)
         // report with multiqc