Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 55 additions & 19 deletions src/aggregate_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ def analyze_alignments(path_prefix):
dict: Dictionary containing analysis results and raw data
"""
read_df = pd.read_csv(f"{path_prefix}.per_read.tsv.gz", sep="\t")
alignments_df = pd.read_csv(f"{path_prefix}.alignments.tsv.gz", sep="\t")

# Pre-aggregate the alignment data by read_id (sum map_len)
map_len_df = alignments_df[alignments_df["is_mapped"] == "Y"].groupby("read_id")["map_len"].sum().reset_index()

# Filter to vector-only reads
read_vector = read_df[read_df["reference_label"] == "vector"]

Expand All @@ -46,10 +51,10 @@ def analyze_alignments(path_prefix):

result = {
"agg_ref_type": get_ref_type_agg(
read_df, total_read_count_all, total_read_count_vector, total_read_count_lambda
read_df, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda
),
"agg_subtype": get_subtype_agg(
read_vector, total_read_count_all, total_read_count_vector, total_read_count_lambda
read_vector, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda
),
}

Expand All @@ -62,15 +67,26 @@ def analyze_alignments(path_prefix):
return result


def get_ref_type_agg(read_df, total_read_count_all, total_read_count_vector, total_read_count_lambda):
def get_ref_type_agg(read_df, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda):
"""Counts and percentages of reference labels and types."""
df = (
read_df.groupby(["reference_label", "assigned_type"], dropna=False)[
"effective_count"
]
.sum()
.reset_index(name="effective_count")
# Merge read_df with pre-aggregated map_len data
merged_df = pd.merge(
read_df[["read_id", "reference_label", "assigned_type", "effective_count"]],
map_len_df,
on="read_id",
how="left" # Keep all reads, even if they don't have alignments
)
merged_df["map_len"] = merged_df["map_len"].fillna(0)

# Do a single groupby to get both effective_count and base
df = merged_df.groupby(["reference_label", "assigned_type"], dropna=False).agg({
"effective_count": "sum",
"map_len": "sum"
}).reset_index()

# Rename map_len to base
df = df.rename(columns={"map_len": "base"})

df = df.sort_values(
["reference_label", "effective_count"], ascending=[False, False]
)
Expand All @@ -86,7 +102,7 @@ def get_ref_type_agg(read_df, total_read_count_all, total_read_count_vector, tot
return df


def get_subtype_agg(read_vector, total_read_count_all, total_read_count_vector, total_read_count_lambda):
def get_subtype_agg(read_vector, map_len_df, total_read_count_all, total_read_count_vector, total_read_count_lambda):
"""Counts and percentages of assigned types and subtypes."""
# Handle case where there are no vector reads
if read_vector.empty:
Expand All @@ -96,16 +112,30 @@ def get_subtype_agg(read_vector, total_read_count_all, total_read_count_vector,
"assigned_type": [pd.NA],
"assigned_subtype": [pd.NA],
"effective_count": [0], # Use 0 so R sum() works correctly
"base": [0],
"pct_vector": [0.0],
"pct_total": [0.0],
"pct_wo_lambda": [0.0]
})

df = (
read_vector.groupby(["assigned_type", "assigned_subtype"])["effective_count"]
.sum()
.reset_index(name="effective_count")
# Merge read_vector with pre-aggregated map_len data
merged_df = pd.merge(
read_vector[["read_id", "assigned_type", "assigned_subtype", "effective_count"]],
map_len_df,
on="read_id",
how="left"
)
merged_df["map_len"] = merged_df["map_len"].fillna(0)

# Group by to get both effective_count and base
df = merged_df.groupby(["assigned_type", "assigned_subtype"]).agg({
"effective_count": "sum",
"map_len": "sum"
}).reset_index()

# Rename map_len to base
df = df.rename(columns={"map_len": "base"})

df = df.sort_values(["assigned_type", "effective_count"], ascending=[False, False])
df["pct_vector"] = round(df["effective_count"] * 100 / total_read_count_vector, 2)
df["pct_total"] = round(df["effective_count"] * 100 / total_read_count_all, 2)
Expand All @@ -115,11 +145,17 @@ def get_subtype_agg(read_vector, total_read_count_all, total_read_count_vector,

def get_flipflop_agg(ff_df):
"""Counts of flip-flop configurations."""
df = (
ff_df.groupby(["type", "subtype", "leftITR", "rightITR"])
.size()
.reset_index(name="count")
)
# Calculate length for each record
ff_df["length"] = ff_df["end"] - ff_df["start"]

# Group by required columns and calculate count and sum of lengths
df = ff_df.groupby(["type", "subtype", "leftITR", "rightITR"]).agg({
"name": "count", # This gives us the count
"length": "sum" # This gives us the base (sum of all lengths)
}).reset_index()

# Rename columns
df = df.rename(columns={"name": "count", "length": "base"})
return df


Expand Down
6 changes: 4 additions & 2 deletions src/report.Rmd
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

---
output:
html_document:
Expand Down Expand Up @@ -214,7 +215,7 @@ total_row <- agg_ref_type %>%
agg_ref_type_with_total <- bind_rows(agg_ref_type, total_row)

# Create the flextable
flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total))) %>%
flextable(agg_ref_type_with_total %>% select(! c(pct_vector, pct_total, "base"))) %>%
set_header_labels(values = c(
"Mapped Reference",
"Assigned Type",
Expand Down Expand Up @@ -634,11 +635,12 @@ if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) {

```{r fftable, message=FALSE, warning=FALSE, results='asis'}
if (exists("df.flipflop") && (nrow(df.flipflop) > 1)) {
flextable(df.flipflop) %>%
flextable(df.flipflop %>% select(! c("base"))) %>%
set_header_labels(values = c("type", "subtype", "leftITR", "rightITR", "count"))
}
```


```{r saverdata, message=FALSE, warning=FALSE}
# Enable for troubleshooting -- takes some time to generate
# save.image(file=paste0(params$path_prefix, '.Rdata'))
Expand Down
Loading