From 699b39705e9125b14a6bdd8e85dedf9d158c9794 Mon Sep 17 00:00:00 2001
From: Beibei Chen <beibei@formbio.com>
Date: Mon, 6 Oct 2025 15:19:30 -0500
Subject: [PATCH 1/8] 1 table

---
 src/aggregate_tables.py | 55 ++++++++++++++++++++++++++++++-----------
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/src/aggregate_tables.py b/src/aggregate_tables.py
index 6830c14..773fcf7 100755
--- a/src/aggregate_tables.py
+++ b/src/aggregate_tables.py
@@ -34,6 +34,8 @@ def analyze_alignments(path_prefix):
         dict: Dictionary containing analysis results and raw data
     """
     read_df = pd.read_csv(f"{path_prefix}.per_read.tsv.gz", sep="\t")
+    alignments_df = pd.read_csv(f"{path_prefix}.alignments.tsv.gz", sep="\t")
+    
     # Filter to vector-only reads
     read_vector = read_df[read_df["reference_label"] == "vector"]
 
@@ -46,10 +48,10 @@ def analyze_alignments(path_prefix):
 
     result = {
         "agg_ref_type": get_ref_type_agg(
-            read_df, total_read_count_all, total_read_count_vector, total_read_count_lambda
+            read_df, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda
         ),
         "agg_subtype": get_subtype_agg(
-            read_vector, total_read_count_all, total_read_count_vector, total_read_count_lambda
+            read_vector, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda
         ),
     }
 
@@ -62,15 +64,26 @@ def analyze_alignments(path_prefix):
     return result
 
 
-def get_ref_type_agg(read_df, total_read_count_all, total_read_count_vector, total_read_count_lambda):
+def get_ref_type_agg(read_df, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda):
     """Counts and percentages of reference labels and types."""
-    df = (
-        read_df.groupby(["reference_label", "assigned_type"], dropna=False)[
-            "effective_count"
-        ]
-        .sum()
-        .reset_index(name="effective_count")
+    # Merge the read_df with alignments_df to get map_len
+    merged_df = pd.merge(
+        read_df[["read_id", "reference_label", "assigned_type", "effective_count"]], 
+        alignments_df[alignments_df["is_mapped"] == "Y"][["read_id", "map_len"]],
+        on="read_id",
+        how="left"  # Keep all reads, even if they don't have alignments
     )
+    merged_df["map_len"] = merged_df["map_len"].fillna(0)
+    
+    # Now do a single groupby to get both effective_count and base
+    df = merged_df.groupby(["reference_label", "assigned_type"], dropna=False).agg({
+        "effective_count": "sum",
+        "map_len": "sum"
+    }).reset_index()
+    
+    # Rename map_len to base
+    df = df.rename(columns={"map_len": "base"})
+    
     df = df.sort_values(
         ["reference_label", "effective_count"], ascending=[False, False]
     )
@@ -86,7 +99,7 @@ def get_ref_type_agg(read_df, total_read_count_all, total_read_count_vector, tot
     return df
 
 
-def get_subtype_agg(read_vector, total_read_count_all, total_read_count_vector, total_read_count_lambda):
+def get_subtype_agg(read_vector, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda):
     """Counts and percentages of assigned types and subtypes."""
     # Handle case where there are no vector reads
     if read_vector.empty:
@@ -96,16 +109,30 @@ def get_subtype_agg(read_vector, total_read_count_all, total_read_count_vector,
             "assigned_type": [pd.NA],
             "assigned_subtype": [pd.NA],
             "effective_count": [0],  # Use 0 so R sum() works correctly
+            "base": [0],
             "pct_vector": [0.0],
             "pct_total": [0.0], 
             "pct_wo_lambda": [0.0]
         })
     
-    df = (
-        read_vector.groupby(["assigned_type", "assigned_subtype"])["effective_count"]
-        .sum()
-        .reset_index(name="effective_count")
+    # Merge read_vector with alignments_df to get map_len
+    merged_df = pd.merge(
+        read_vector[["read_id", "assigned_type", "assigned_subtype", "effective_count"]],
+        alignments_df[alignments_df["is_mapped"] == "Y"][["read_id", "map_len"]],
+        on="read_id",
+        how="left"
     )
+    merged_df["map_len"] = merged_df["map_len"].fillna(0)
+    
+    # Single groupby to get both effective_count and base
+    df = merged_df.groupby(["assigned_type", "assigned_subtype"]).agg({
+        "effective_count": "sum",
+        "map_len": "sum"
+    }).reset_index()
+    
+    # Rename map_len to base
+    df = df.rename(columns={"map_len": "base"})
+    
     df = df.sort_values(["assigned_type", "effective_count"], ascending=[False, False])
     df["pct_vector"] = round(df["effective_count"] * 100 / total_read_count_vector, 2)
     df["pct_total"] = round(df["effective_count"] * 100 / total_read_count_all, 2)

From 44d1fd423aab74841ae6fce5c8755d37c4a757e9 Mon Sep 17 00:00:00 2001
From: Beibei Chen <beibei@formbio.com>
Date: Mon, 6 Oct 2025 15:30:57 -0500
Subject: [PATCH 2/8] update

---
 src/aggregate_tables.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/aggregate_tables.py b/src/aggregate_tables.py
index 773fcf7..f9654c0 100755
--- a/src/aggregate_tables.py
+++ b/src/aggregate_tables.py
@@ -36,6 +36,9 @@ def analyze_alignments(path_prefix):
     read_df = pd.read_csv(f"{path_prefix}.per_read.tsv.gz", sep="\t")
     alignments_df = pd.read_csv(f"{path_prefix}.alignments.tsv.gz", sep="\t")
     
+    # Pre-aggregate the alignment data by read_id (sum map_len)
+    map_len_df = alignments_df[alignments_df["is_mapped"] == "Y"].groupby("read_id")["map_len"].sum().reset_index()
+    
     # Filter to vector-only reads
     read_vector = read_df[read_df["reference_label"] == "vector"]
 
@@ -48,10 +51,10 @@ def analyze_alignments(path_prefix):
 
     result = {
         "agg_ref_type": get_ref_type_agg(
-            read_df, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda
+            read_df, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda
         ),
         "agg_subtype": get_subtype_agg(
-            read_vector, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda
+            read_vector, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda
         ),
     }
 
@@ -64,18 +67,18 @@ def analyze_alignments(path_prefix):
     return result
 
 
-def get_ref_type_agg(read_df, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda):
+def get_ref_type_agg(read_df, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda):
     """Counts and percentages of reference labels and types."""
-    # Merge the read_df with alignments_df to get map_len
+    # Merge read_df with pre-aggregated map_len data
     merged_df = pd.merge(
         read_df[["read_id", "reference_label", "assigned_type", "effective_count"]], 
-        alignments_df[alignments_df["is_mapped"] == "Y"][["read_id", "map_len"]],
+        map_len_df,
         on="read_id",
         how="left"  # Keep all reads, even if they don't have alignments
     )
     merged_df["map_len"] = merged_df["map_len"].fillna(0)
     
-    # Now do a single groupby to get both effective_count and base
+    # Do a single groupby to get both effective_count and base
     df = merged_df.groupby(["reference_label", "assigned_type"], dropna=False).agg({
         "effective_count": "sum",
         "map_len": "sum"
@@ -99,7 +102,7 @@ def get_ref_type_agg(read_df, alignments_df, total_read_count_all, total_read_co
     return df
 
 
-def get_subtype_agg(read_vector, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda):
+def get_subtype_agg(read_vector, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda):
     """Counts and percentages of assigned types and subtypes."""
     # Handle case where there are no vector reads
     if read_vector.empty:
@@ -115,16 +118,16 @@ def get_subtype_agg(read_vector, alignments_df, total_read_count_all, total_read
             "pct_wo_lambda": [0.0]
         })
     
-    # Merge read_vector with alignments_df to get map_len
+    # Merge read_vector with pre-aggregated map_len data
     merged_df = pd.merge(
         read_vector[["read_id", "assigned_type", "assigned_subtype", "effective_count"]],
-        alignments_df[alignments_df["is_mapped"] == "Y"][["read_id", "map_len"]],
+        map_len_df,
         on="read_id",
         how="left"
     )
     merged_df["map_len"] = merged_df["map_len"].fillna(0)
     
-    # Single groupby to get both effective_count and base
+    # Group by to get both effective_count and base
     df = merged_df.groupby(["assigned_type", "assigned_subtype"]).agg({
         "effective_count": "sum",
         "map_len": "sum"

From 739b3d8fa725aaf1af53ae1f975307ba19e687fc Mon Sep 17 00:00:00 2001
From: Beibei Chen <beibei@formbio.com>
Date: Mon, 6 Oct 2025 15:56:09 -0500
Subject: [PATCH 3/8] add basecount

---
 src/aggregate_tables.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/aggregate_tables.py b/src/aggregate_tables.py
index f9654c0..4b25ddc 100755
--- a/src/aggregate_tables.py
+++ b/src/aggregate_tables.py
@@ -145,11 +145,17 @@ def get_subtype_agg(read_vector, map_len_df, total_read_count_all, total_read_co
 
 def get_flipflop_agg(ff_df):
     """Counts of flip-flop configurations."""
-    df = (
-        ff_df.groupby(["type", "subtype", "leftITR", "rightITR"])
-        .size()
-        .reset_index(name="count")
-    )
+    # Calculate length for each record
+    ff_df["length"] = ff_df["end"] - ff_df["start"]
+    
+    # Group by required columns and calculate count and sum of lengths
+    df = ff_df.groupby(["type", "subtype", "leftITR", "rightITR"]).agg({
+        "name": "count",    # This gives us the count
+        "length": "sum"     # This gives us the base (sum of all lengths)
+    }).reset_index()
+    
+    # Rename columns
+    df = df.rename(columns={"name": "count", "length": "base"})
     return df
 
 

From a66264c8dacdb4878e15c2fdbe8f704547e7f759 Mon Sep 17 00:00:00 2001
From: Beibei Chen <beibei@formbio.com>
Date: Mon, 6 Oct 2025 16:54:50 -0500
Subject: [PATCH 4/8] update rmd

---
 src/report.Rmd | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/report.Rmd b/src/report.Rmd
index 91375ef..9d82510 100644
--- a/src/report.Rmd
+++ b/src/report.Rmd
@@ -206,6 +206,7 @@ total_row <- agg_ref_type %>%
     reference_label = "Total",
     assigned_type = "",
     effective_count = sum(effective_count, na.rm = TRUE),
+    base = sum(base, na.rm=TRUE),
     counts_wo_lambda = sum(counts_wo_lambda, na.rm = TRUE),
     pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE)
   )
@@ -219,6 +220,7 @@ flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total))) %>%
     "Mapped Reference", 
     "Assigned Type", 
     "Counts", 
+    "Base",
     "Counts Excluding Lambda", 
     "Frequency (%, Excluding Lambda)"
   ))
@@ -257,6 +259,7 @@ total_row <- vectypes %>%
   summarise(
     assigned_type = "Total",
     effective_count = sum(effective_count, na.rm = TRUE),
+    baes = sum(base, na.rm = TRUE),
     pct_vector = sum(pct_vector, na.rm = TRUE),
     pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE)
   )
@@ -269,6 +272,7 @@ flextable(vectypes_with_total %>% select(assigned_type, effective_count, pct_wo_
   set_header_labels(values = c(
     "Assigned Type",
     "Counts",
+    "Base",
     "Frequency (%, Excluding Lambda)",
     "GOI Vector Frequency (%)"
   ))
@@ -343,6 +347,7 @@ total_row <- agg_subtype %>%
     assigned_type = "Total",
     assigned_subtype = "",
     effective_count = sum(effective_count, na.rm = TRUE),
+    base = sum(base, na.rm=TRUE),
     pct_vector= sum(pct_vector, na.rm = TRUE),
     pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE)
   )
@@ -351,12 +356,13 @@ total_row <- agg_subtype %>%
 agg_subtype_with_total <- bind_rows(agg_subtype, total_row)
 
 # Create the flextable with reordered columns
-flextable(agg_subtype_with_total %>% select(assigned_type, assigned_subtype, effective_count, pct_wo_lambda, pct_vector)) %>%
+flextable(agg_subtype_with_total %>% select(assigned_type, assigned_subtype, effective_count, base, pct_wo_lambda, pct_vector)) %>%
   set_header_labels(
     values = c(
       "Assigned Type",
       "Assigned Subtype",
       "Counts",
+      "Base",
       "Frequency (%, Excluding Lambda)",
       "GOI Vector Frequency (%)"
     )
@@ -635,7 +641,7 @@ if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) {
 ```{r fftable, message=FALSE, warning=FALSE, results='asis'}
 if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) {
   flextable(df.flipflop) %>%
-    set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count"))
+    set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count", "base"))
 }
 ```
 

From f38cfca76cf1bc54b8878558b113a230f5904a3c Mon Sep 17 00:00:00 2001
From: Beibei Chen <beibei@formbio.com>
Date: Mon, 6 Oct 2025 20:14:53 -0500
Subject: [PATCH 5/8] update

---
 src/report.Rmd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/report.Rmd b/src/report.Rmd
index 9d82510..2f2c44b 100644
--- a/src/report.Rmd
+++ b/src/report.Rmd
@@ -259,7 +259,7 @@ total_row <- vectypes %>%
   summarise(
     assigned_type = "Total",
     effective_count = sum(effective_count, na.rm = TRUE),
-    baes = sum(base, na.rm = TRUE),
+    base = sum(base, na.rm = TRUE),
     pct_vector = sum(pct_vector, na.rm = TRUE),
     pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE)
   )
@@ -268,7 +268,7 @@ total_row <- vectypes %>%
 vectypes_with_total <- bind_rows(vectypes, total_row)
 
 # Create the flextable with reordered columns
-flextable(vectypes_with_total %>% select(assigned_type, effective_count, pct_wo_lambda, pct_vector)) %>%
+flextable(vectypes_with_total %>% select(assigned_type, effective_count, base, pct_wo_lambda, pct_vector)) %>%
   set_header_labels(values = c(
     "Assigned Type",
     "Counts",

From 9a153936950d8335fb0b6965600cce5674f8aff9 Mon Sep 17 00:00:00 2001
From: Beibei Chen <beibei@formbio.com>
Date: Tue, 7 Oct 2025 09:41:07 -0500
Subject: [PATCH 6/8] rename column

---
 src/report.Rmd | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/report.Rmd b/src/report.Rmd
index 2f2c44b..248963c 100644
--- a/src/report.Rmd
+++ b/src/report.Rmd
@@ -220,7 +220,7 @@ flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total))) %>%
     "Mapped Reference", 
     "Assigned Type", 
     "Counts", 
-    "Base",
+    "Base Counts",
     "Counts Excluding Lambda", 
     "Frequency (%, Excluding Lambda)"
   ))
@@ -272,7 +272,7 @@ flextable(vectypes_with_total %>% select(assigned_type, effective_count, base, p
   set_header_labels(values = c(
     "Assigned Type",
     "Counts",
-    "Base",
+    "Base Count",
     "Frequency (%, Excluding Lambda)",
     "GOI Vector Frequency (%)"
   ))
@@ -362,7 +362,7 @@ flextable(agg_subtype_with_total %>% select(assigned_type, assigned_subtype, eff
       "Assigned Type",
       "Assigned Subtype",
       "Counts",
-      "Base",
+      "Base Counts",
       "Frequency (%, Excluding Lambda)",
       "GOI Vector Frequency (%)"
     )
@@ -641,7 +641,7 @@ if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) {
 ```{r fftable, message=FALSE, warning=FALSE, results='asis'}
 if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) {
   flextable(df.flipflop) %>%
-    set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count", "base"))
+    set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count", "base counts"))
 }
 ```
 

From 2b4216f2a110cde5335bf8ca94e383d2a90d4590 Mon Sep 17 00:00:00 2001
From: Beibei Chen <beibei@formbio.com>
Date: Tue, 7 Oct 2025 10:25:37 -0500
Subject: [PATCH 7/8] no change on report

---
 src/report.Rmd | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/report.Rmd b/src/report.Rmd
index 248963c..f6bf0e2 100644
--- a/src/report.Rmd
+++ b/src/report.Rmd
@@ -1,3 +1,4 @@
+
 ---
 output:
   html_document:
@@ -206,7 +207,6 @@ total_row <- agg_ref_type %>%
     reference_label = "Total",
     assigned_type = "",
     effective_count = sum(effective_count, na.rm = TRUE),
-    base = sum(base, na.rm=TRUE),
     counts_wo_lambda = sum(counts_wo_lambda, na.rm = TRUE),
     pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE)
   )
@@ -220,7 +220,6 @@ flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total))) %>%
     "Mapped Reference", 
     "Assigned Type", 
     "Counts", 
-    "Base Counts",
     "Counts Excluding Lambda", 
     "Frequency (%, Excluding Lambda)"
   ))
@@ -259,7 +258,6 @@ total_row <- vectypes %>%
   summarise(
     assigned_type = "Total",
     effective_count = sum(effective_count, na.rm = TRUE),
-    base = sum(base, na.rm = TRUE),
     pct_vector = sum(pct_vector, na.rm = TRUE),
     pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE)
   )
@@ -268,11 +266,10 @@ total_row <- vectypes %>%
 vectypes_with_total <- bind_rows(vectypes, total_row)
 
 # Create the flextable with reordered columns
-flextable(vectypes_with_total %>% select(assigned_type, effective_count, base, pct_wo_lambda, pct_vector)) %>%
+flextable(vectypes_with_total %>% select(assigned_type, effective_count, pct_wo_lambda, pct_vector)) %>%
   set_header_labels(values = c(
     "Assigned Type",
     "Counts",
-    "Base Count",
     "Frequency (%, Excluding Lambda)",
     "GOI Vector Frequency (%)"
   ))
@@ -347,7 +344,6 @@ total_row <- agg_subtype %>%
     assigned_type = "Total",
     assigned_subtype = "",
     effective_count = sum(effective_count, na.rm = TRUE),
-    base = sum(base, na.rm=TRUE),
     pct_vector= sum(pct_vector, na.rm = TRUE),
     pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE)
   )
@@ -356,13 +352,12 @@ total_row <- agg_subtype %>%
 agg_subtype_with_total <- bind_rows(agg_subtype, total_row)
 
 # Create the flextable with reordered columns
-flextable(agg_subtype_with_total %>% select(assigned_type, assigned_subtype, effective_count, base, pct_wo_lambda, pct_vector)) %>%
+flextable(agg_subtype_with_total %>% select(assigned_type, assigned_subtype, effective_count, pct_wo_lambda, pct_vector)) %>%
   set_header_labels(
     values = c(
       "Assigned Type",
       "Assigned Subtype",
       "Counts",
-      "Base Counts",
       "Frequency (%, Excluding Lambda)",
       "GOI Vector Frequency (%)"
     )
@@ -640,11 +635,12 @@ if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) {
 
 ```{r fftable, message=FALSE, warning=FALSE, results='asis'}
 if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) {
-  flextable(df.flipflop) %>%
-    set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count", "base counts"))
+  flextable(df.flipflop %>% select(! c("base"))) %>%
+    set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count"))
 }
 ```
 
+
 ```{r saverdata, message=FALSE, warning=FALSE}
 # Enable for troubleshooting -- takes some time to generate
 # save.image(file=paste0(params$path_prefix, '.Rdata'))

From 411247afada7d327c21a4893e01a31fff4df3656 Mon Sep 17 00:00:00 2001
From: Beibei Chen <beibei@formbio.com>
Date: Tue, 7 Oct 2025 10:43:40 -0500
Subject: [PATCH 8/8] column

---
 src/report.Rmd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/report.Rmd b/src/report.Rmd
index f6bf0e2..d2e0e3a 100644
--- a/src/report.Rmd
+++ b/src/report.Rmd
@@ -215,7 +215,7 @@ total_row <- agg_ref_type %>%
 agg_ref_type_with_total <- bind_rows(agg_ref_type, total_row)
 
 # Create the flextable
-flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total))) %>%
+flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total, "base"))) %>%
   set_header_labels(values = c(
     "Mapped Reference", 
     "Assigned Type",