From 699b39705e9125b14a6bdd8e85dedf9d158c9794 Mon Sep 17 00:00:00 2001 From: Beibei Chen Date: Mon, 6 Oct 2025 15:19:30 -0500 Subject: [PATCH 1/8] 1 table --- src/aggregate_tables.py | 55 ++++++++++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/src/aggregate_tables.py b/src/aggregate_tables.py index 6830c14..773fcf7 100755 --- a/src/aggregate_tables.py +++ b/src/aggregate_tables.py @@ -34,6 +34,8 @@ def analyze_alignments(path_prefix): dict: Dictionary containing analysis results and raw data """ read_df = pd.read_csv(f"{path_prefix}.per_read.tsv.gz", sep="\t") + alignments_df = pd.read_csv(f"{path_prefix}.alignments.tsv.gz", sep="\t") + # Filter to vector-only reads read_vector = read_df[read_df["reference_label"] == "vector"] @@ -46,10 +48,10 @@ def analyze_alignments(path_prefix): result = { "agg_ref_type": get_ref_type_agg( - read_df, total_read_count_all, total_read_count_vector, total_read_count_lambda + read_df, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda ), "agg_subtype": get_subtype_agg( - read_vector, total_read_count_all, total_read_count_vector, total_read_count_lambda + read_vector, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda ), } @@ -62,15 +64,26 @@ def analyze_alignments(path_prefix): return result -def get_ref_type_agg(read_df, total_read_count_all, total_read_count_vector, total_read_count_lambda): +def get_ref_type_agg(read_df, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda): """Counts and percentages of reference labels and types.""" - df = ( - read_df.groupby(["reference_label", "assigned_type"], dropna=False)[ - "effective_count" - ] - .sum() - .reset_index(name="effective_count") + # Merge the read_df with alignments_df to get map_len + merged_df = pd.merge( + read_df[["read_id", "reference_label", "assigned_type", "effective_count"]], + alignments_df[alignments_df["is_mapped"] == "Y"][["read_id", "map_len"]], + on="read_id", + how="left" # Keep all reads, even if they don't have alignments ) + merged_df["map_len"] = merged_df["map_len"].fillna(0) + + # Now do a single groupby to get both effective_count and base + df = merged_df.groupby(["reference_label", "assigned_type"], dropna=False).agg({ + "effective_count": "sum", + "map_len": "sum" + }).reset_index() + + # Rename map_len to base + df = df.rename(columns={"map_len": "base"}) + df = df.sort_values( ["reference_label", "effective_count"], ascending=[False, False] ) @@ -86,7 +99,7 @@ def get_ref_type_agg(read_df, total_read_count_all, total_read_count_vector, tot return df -def get_subtype_agg(read_vector, total_read_count_all, total_read_count_vector, total_read_count_lambda): +def get_subtype_agg(read_vector, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda): """Counts and percentages of assigned types and subtypes.""" # Handle case where there are no vector reads if read_vector.empty: @@ -96,16 +109,30 @@ def get_subtype_agg(read_vector, total_read_count_all, total_read_count_vector, "assigned_type": [pd.NA], "assigned_subtype": [pd.NA], "effective_count": [0], # Use 0 so R sum() works correctly + "base": [0], "pct_vector": [0.0], "pct_total": [0.0], "pct_wo_lambda": [0.0] }) - df = ( - read_vector.groupby(["assigned_type", "assigned_subtype"])["effective_count"] - .sum() - .reset_index(name="effective_count") + # Merge read_vector with alignments_df to get map_len + merged_df = pd.merge( + read_vector[["read_id", "assigned_type", "assigned_subtype", "effective_count"]], + alignments_df[alignments_df["is_mapped"] == "Y"][["read_id", "map_len"]], + on="read_id", + how="left" ) + merged_df["map_len"] = merged_df["map_len"].fillna(0) + + # Single groupby to get both effective_count and base + df = merged_df.groupby(["assigned_type", "assigned_subtype"]).agg({ + "effective_count": "sum", + "map_len": "sum" + }).reset_index() + + # Rename map_len to base + df = df.rename(columns={"map_len": "base"}) + df = df.sort_values(["assigned_type", "effective_count"], ascending=[False, False]) df["pct_vector"] = round(df["effective_count"] * 100 / total_read_count_vector, 2) df["pct_total"] = round(df["effective_count"] * 100 / total_read_count_all, 2) From 44d1fd423aab74841ae6fce5c8755d37c4a757e9 Mon Sep 17 00:00:00 2001 From: Beibei Chen Date: Mon, 6 Oct 2025 15:30:57 -0500 Subject: [PATCH 2/8] update --- src/aggregate_tables.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/aggregate_tables.py b/src/aggregate_tables.py index 773fcf7..f9654c0 100755 --- a/src/aggregate_tables.py +++ b/src/aggregate_tables.py @@ -36,6 +36,9 @@ def analyze_alignments(path_prefix): read_df = pd.read_csv(f"{path_prefix}.per_read.tsv.gz", sep="\t") alignments_df = pd.read_csv(f"{path_prefix}.alignments.tsv.gz", sep="\t") + # Pre-aggregate the alignment data by read_id (sum map_len) + map_len_df = alignments_df[alignments_df["is_mapped"] == "Y"].groupby("read_id")["map_len"].sum().reset_index() + # Filter to vector-only reads read_vector = read_df[read_df["reference_label"] == "vector"] @@ -48,10 +51,10 @@ def analyze_alignments(path_prefix): result = { "agg_ref_type": get_ref_type_agg( - read_df, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda + read_df, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda ), "agg_subtype": get_subtype_agg( - read_vector, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda + read_vector, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda ), } @@ -64,18 +67,18 @@ def analyze_alignments(path_prefix): return result -def get_ref_type_agg(read_df, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda): +def get_ref_type_agg(read_df, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda): """Counts and percentages of reference labels and types.""" - # Merge the read_df with alignments_df to get map_len + # Merge read_df with pre-aggregated map_len data merged_df = pd.merge( read_df[["read_id", "reference_label", "assigned_type", "effective_count"]], - alignments_df[alignments_df["is_mapped"] == "Y"][["read_id", "map_len"]], + map_len_df, on="read_id", how="left" # Keep all reads, even if they don't have alignments ) merged_df["map_len"] = merged_df["map_len"].fillna(0) - # Now do a single groupby to get both effective_count and base + # Do a single groupby to get both effective_count and base df = merged_df.groupby(["reference_label", "assigned_type"], dropna=False).agg({ "effective_count": "sum", "map_len": "sum" @@ -99,7 +102,7 @@ def get_ref_type_agg(read_df, alignments_df, total_read_count_all, total_read_co return df -def get_subtype_agg(read_vector, alignments_df, total_read_count_all, total_read_count_vector, total_read_count_lambda): +def get_subtype_agg(read_vector, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda): """Counts and percentages of assigned types and subtypes.""" # Handle case where there are no vector reads if read_vector.empty: @@ -115,16 +118,16 @@ def get_subtype_agg(read_vector, alignments_df, total_read_count_all, total_read "pct_wo_lambda": [0.0] }) - # Merge read_vector with alignments_df to get map_len + # Merge read_vector with pre-aggregated map_len data merged_df = pd.merge( read_vector[["read_id", "assigned_type", "assigned_subtype", "effective_count"]], - alignments_df[alignments_df["is_mapped"] == "Y"][["read_id", "map_len"]], + map_len_df, on="read_id", how="left" ) merged_df["map_len"] = merged_df["map_len"].fillna(0) - # Single groupby to get both effective_count and base + # Group by to get both effective_count and base df = merged_df.groupby(["assigned_type", "assigned_subtype"]).agg({ "effective_count": "sum", "map_len": "sum" From 739b3d8fa725aaf1af53ae1f975307ba19e687fc Mon Sep 17 00:00:00 2001 From: Beibei Chen Date: Mon, 6 Oct 2025 15:56:09 -0500 Subject: [PATCH 3/8] add basecount --- src/aggregate_tables.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/aggregate_tables.py b/src/aggregate_tables.py index f9654c0..4b25ddc 100755 --- a/src/aggregate_tables.py +++ b/src/aggregate_tables.py @@ -145,11 +145,17 @@ def get_subtype_agg(read_vector, map_len_df, total_read_count_all, total_read_co def get_flipflop_agg(ff_df): """Counts of flip-flop configurations.""" - df = ( - ff_df.groupby(["type", "subtype", "leftITR", "rightITR"]) - .size() - .reset_index(name="count") - ) + # Calculate length for each record + ff_df["length"] = ff_df["end"] - ff_df["start"] + + # Group by required columns and calculate count and sum of lengths + df = ff_df.groupby(["type", "subtype", "leftITR", "rightITR"]).agg({ + "name": "count", # This gives us the count + "length": "sum" # This gives us the base (sum of all lengths) + }).reset_index() + + # Rename columns + df = df.rename(columns={"name": "count", "length": "base"}) return df From a66264c8dacdb4878e15c2fdbe8f704547e7f759 Mon Sep 17 00:00:00 2001 From: Beibei Chen Date: Mon, 6 Oct 2025 16:54:50 -0500 Subject: [PATCH 4/8] update rmd --- src/report.Rmd | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/report.Rmd b/src/report.Rmd index 91375ef..9d82510 100644 --- a/src/report.Rmd +++ b/src/report.Rmd @@ -206,6 +206,7 @@ total_row <- agg_ref_type %>% reference_label = "Total", assigned_type = "", effective_count = sum(effective_count, na.rm = TRUE), + base = sum(base, na.rm=TRUE), counts_wo_lambda = sum(counts_wo_lambda, na.rm = TRUE), pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE) ) @@ -219,6 +220,7 @@ flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total))) %>% "Mapped Reference", "Assigned Type", "Counts", + "Base", "Counts Excluding Lambda", "Frequency (%, Excluding Lambda)" )) @@ -257,6 +259,7 @@ total_row <- vectypes %>% summarise( assigned_type = "Total", effective_count = sum(effective_count, na.rm = TRUE), + baes = sum(base, na.rm = TRUE), pct_vector = sum(pct_vector, na.rm = TRUE), pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE) ) @@ -269,6 +272,7 @@ flextable(vectypes_with_total %>% select(assigned_type, effective_count, pct_wo_ set_header_labels(values = c( "Assigned Type", "Counts", + "Base", "Frequency (%, Excluding Lambda)", "GOI Vector Frequency (%)" )) @@ -343,6 +347,7 @@ total_row <- agg_subtype %>% assigned_type = "Total", assigned_subtype = "", effective_count = sum(effective_count, na.rm = TRUE), + base = sum(base, na.rm=TRUE), pct_vector= sum(pct_vector, na.rm = TRUE), pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE) ) @@ -351,12 +356,13 @@ total_row <- agg_subtype %>% agg_subtype_with_total <- bind_rows(agg_subtype, total_row) # Create the flextable with reordered columns -flextable(agg_subtype_with_total %>% select(assigned_type, assigned_subtype, effective_count, pct_wo_lambda, pct_vector)) %>% +flextable(agg_subtype_with_total %>% select(assigned_type, assigned_subtype, effective_count, base, pct_wo_lambda, pct_vector)) %>% set_header_labels( values = c( "Assigned Type", "Assigned Subtype", "Counts", + "Base", "Frequency (%, Excluding Lambda)", "GOI Vector Frequency (%)" ) @@ -635,7 +641,7 @@ if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) { ```{r fftable, message=FALSE, warning=FALSE, results='asis'} if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) { flextable(df.flipflop) %>% - set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count")) + set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count", "base")) } ``` From f38cfca76cf1bc54b8878558b113a230f5904a3c Mon Sep 17 00:00:00 2001 From: Beibei Chen Date: Mon, 6 Oct 2025 20:14:53 -0500 Subject: [PATCH 5/8] update --- src/report.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/report.Rmd b/src/report.Rmd index 9d82510..2f2c44b 100644 --- a/src/report.Rmd +++ b/src/report.Rmd @@ -259,7 +259,7 @@ total_row <- vectypes %>% summarise( assigned_type = "Total", effective_count = sum(effective_count, na.rm = TRUE), - baes = sum(base, na.rm = TRUE), + base = sum(base, na.rm = TRUE), pct_vector = sum(pct_vector, na.rm = TRUE), pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE) ) @@ -268,7 +268,7 @@ total_row <- vectypes %>% vectypes_with_total <- bind_rows(vectypes, total_row) # Create the flextable with reordered columns -flextable(vectypes_with_total %>% select(assigned_type, effective_count, pct_wo_lambda, pct_vector)) %>% +flextable(vectypes_with_total %>% select(assigned_type, effective_count, base, pct_wo_lambda, pct_vector)) %>% set_header_labels(values = c( "Assigned Type", "Counts", From 9a153936950d8335fb0b6965600cce5674f8aff9 Mon Sep 17 00:00:00 2001 From: Beibei Chen Date: Tue, 7 Oct 2025 09:41:07 -0500 Subject: [PATCH 6/8] rename column --- src/report.Rmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/report.Rmd b/src/report.Rmd index 2f2c44b..248963c 100644 --- a/src/report.Rmd +++ b/src/report.Rmd @@ -220,7 +220,7 @@ flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total))) %>% "Mapped Reference", "Assigned Type", "Counts", - "Base", + "Base Counts", "Counts Excluding Lambda", "Frequency (%, Excluding Lambda)" )) @@ -272,7 +272,7 @@ flextable(vectypes_with_total %>% select(assigned_type, effective_count, base, p set_header_labels(values = c( "Assigned Type", "Counts", - "Base", + "Base Count", "Frequency (%, Excluding Lambda)", "GOI Vector Frequency (%)" )) @@ -362,7 +362,7 @@ flextable(agg_subtype_with_total %>% select(assigned_type, assigned_subtype, eff "Assigned Type", "Assigned Subtype", "Counts", - "Base", + "Base Counts", "Frequency (%, Excluding Lambda)", "GOI Vector Frequency (%)" ) @@ -641,7 +641,7 @@ if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) { ```{r fftable, message=FALSE, warning=FALSE, results='asis'} if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) { flextable(df.flipflop) %>% - set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count", "base")) + set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count", "base counts")) } ``` From 2b4216f2a110cde5335bf8ca94e383d2a90d4590 Mon Sep 17 00:00:00 2001 From: Beibei Chen Date: Tue, 7 Oct 2025 10:25:37 -0500 Subject: [PATCH 7/8] no change on report --- src/report.Rmd | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/report.Rmd b/src/report.Rmd index 248963c..f6bf0e2 100644 --- a/src/report.Rmd +++ b/src/report.Rmd @@ -1,3 +1,4 @@ + --- output: html_document: @@ -206,7 +207,6 @@ total_row <- agg_ref_type %>% reference_label = "Total", assigned_type = "", effective_count = sum(effective_count, na.rm = TRUE), - base = sum(base, na.rm=TRUE), counts_wo_lambda = sum(counts_wo_lambda, na.rm = TRUE), pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE) ) @@ -220,7 +220,6 @@ flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total))) %>% "Mapped Reference", "Assigned Type", "Counts", - "Base Counts", "Counts Excluding Lambda", "Frequency (%, Excluding Lambda)" )) @@ -259,7 +258,6 @@ total_row <- vectypes %>% summarise( assigned_type = "Total", effective_count = sum(effective_count, na.rm = TRUE), - base = sum(base, na.rm = TRUE), pct_vector = sum(pct_vector, na.rm = TRUE), pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE) ) @@ -268,11 +266,10 @@ total_row <- vectypes %>% vectypes_with_total <- bind_rows(vectypes, total_row) # Create the flextable with reordered columns -flextable(vectypes_with_total %>% select(assigned_type, effective_count, base, pct_wo_lambda, pct_vector)) %>% +flextable(vectypes_with_total %>% select(assigned_type, effective_count, pct_wo_lambda, pct_vector)) %>% set_header_labels(values = c( "Assigned Type", "Counts", - "Base Count", "Frequency (%, Excluding Lambda)", "GOI Vector Frequency (%)" )) @@ -347,7 +344,6 @@ total_row <- agg_subtype %>% assigned_type = "Total", assigned_subtype = "", effective_count = sum(effective_count, na.rm = TRUE), - base = sum(base, na.rm=TRUE), pct_vector= sum(pct_vector, na.rm = TRUE), pct_wo_lambda = sum(pct_wo_lambda, na.rm = TRUE) ) @@ -356,13 +352,12 @@ total_row <- agg_subtype %>% agg_subtype_with_total <- bind_rows(agg_subtype, total_row) # Create the flextable with reordered columns -flextable(agg_subtype_with_total %>% select(assigned_type, assigned_subtype, effective_count, base, pct_wo_lambda, pct_vector)) %>% +flextable(agg_subtype_with_total %>% select(assigned_type, assigned_subtype, effective_count, pct_wo_lambda, pct_vector)) %>% set_header_labels( values = c( "Assigned Type", "Assigned Subtype", "Counts", - "Base Counts", "Frequency (%, Excluding Lambda)", "GOI Vector Frequency (%)" ) @@ -640,11 +635,12 @@ if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) { ```{r fftable, message=FALSE, warning=FALSE, results='asis'} if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) { - flextable(df.flipflop) %>% - set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count", "base counts")) + flextable(df.flipflop %>% select(! c("base"))) %>% + set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count")) } ``` + ```{r saverdata, message=FALSE, warning=FALSE} # Enable for troubleshooting -- takes some time to generate # save.image(file=paste0(params$path_prefix, '.Rdata')) From 411247afada7d327c21a4893e01a31fff4df3656 Mon Sep 17 00:00:00 2001 From: Beibei Chen Date: Tue, 7 Oct 2025 10:43:40 -0500 Subject: [PATCH 8/8] column --- src/report.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/report.Rmd b/src/report.Rmd index f6bf0e2..d2e0e3a 100644 --- a/src/report.Rmd +++ b/src/report.Rmd @@ -215,7 +215,7 @@ total_row <- agg_ref_type %>% agg_ref_type_with_total <- bind_rows(agg_ref_type, total_row) # Create the flextable -flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total))) %>% +flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total, "base"))) %>% set_header_labels(values = c( "Mapped Reference", "Assigned Type",