From 49a88ed16a08266db9bfb83d26be83fed2484ee4 Mon Sep 17 00:00:00 2001
From: Elizaveta Vlasova
Date: Fri, 16 May 2025 22:25:57 +0300
Subject: [PATCH] Add full reports for the pyigmap pipeline

---
 bin/reporter/run.py                     |  70 ++++++++------
 bin/reporter/utils.py                   |  79 ++++++++++++++--
 bin/reporter/viz/__init__.py            |   0
 bin/reporter/viz/chain_flow.py          |  50 ++++++++++
 bin/reporter/viz/final_tab.py           |  93 ++++++++++++++++++
 bin/reporter/viz/igblast_tab.py         |  71 ++++++++++++++
 bin/reporter/viz/plot_style.py          |  13 +++
 bin/reporter/viz/report.py              |  41 ++++++++
 bin/reporter/{viz.py => viz/umi_tab.py} | 121 +++++++++---------------
 9 files changed, 426 insertions(+), 112 deletions(-)
 create mode 100644 bin/reporter/viz/__init__.py
 create mode 100644 bin/reporter/viz/chain_flow.py
 create mode 100644 bin/reporter/viz/final_tab.py
 create mode 100644 bin/reporter/viz/igblast_tab.py
 create mode 100644 bin/reporter/viz/plot_style.py
 create mode 100644 bin/reporter/viz/report.py
 rename bin/reporter/{viz.py => viz/umi_tab.py} (80%)

diff --git a/bin/reporter/run.py b/bin/reporter/run.py
index cfd9e85..a7c4cd5 100644
--- a/bin/reporter/run.py
+++ b/bin/reporter/run.py
@@ -3,23 +3,47 @@
 import pandas as pd
 
 from logger import set_logger
-
-from utils import get_read_to_umi_mapping, get_consensus_group_size_per_read
-from viz import create_report
+from utils import (
+    parse_UMI_data,
+    parse_vidjil_fasta,
+    parse_igblast_tsv,
+    load_final_outputs
+)
+from viz.report import create_report
 
 logger = set_logger(name=__file__)
 
+
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
-    parser.add_argument('--in-fq1-pyumi', help='Input fastq.gz file after pyumi step, SE or PE pair 1',
+    parser.add_argument('--in-fq1-pyumi',
+                        help='Input fastq.gz file after pyumi step, SE or PE pair 1. Needed if UMIs are specified',
+                        required=False)
+    parser.add_argument('--in-fq2-pyumi',
+                        help='Input fastq.gz file after pyumi step, PE pair 2. Needed if UMIs are specified')
+
+    parser.add_argument('--in-fq1-calib',
+                        help='Input fastq.gz file after calib step, SE or PE pair 1. Needed if UMIs are specified',
+                        required=False)
+    parser.add_argument('--in-fq2-calib',
+                        help='Input fastq.gz file after calib step, PE pair 2. Needed if UMIs are specified')
+
+    parser.add_argument('--umi-reverse', action='store_true')
+
+    parser.add_argument('--vidjil-fasta',
+                        help='Input file with vidjil algo results. Needed if this step was used')
+
+    parser.add_argument('--igblast-tsv',
+                        help='File with IgBlast results',
                         required=True)
-    parser.add_argument('--in-fq2-pyumi', help='Input fastq.gz file after pyumi step, PE pair 2')
-    parser.add_argument('--in-fq1-calib', help='Input fastq.gz file after calib step, SE or PE pair 2',
+    parser.add_argument('--final-stat',
+                        help='File with final results (json stat file)',
                         required=True)
-    parser.add_argument('--in-fq2-calib', help='Input fastq.gz file after calib step, PE pair 2')
-    parser.add_argument('--umi-reverse', action='store_true')
+    parser.add_argument('--final-clones',
+                        help='File with final clonotype results (clone data file)',
+                        required=True)
 
     parser.add_argument('--report-file')
 
@@ -30,28 +54,18 @@ def main() -> None:
     args = parse_args()
     logger.info(f"Starting program with the following arguments: {args}")
 
-    pyumi_data_path = args.in_fq1_pyumi if not args.umi_reverse else args.in_fq2_pyumi
-    calib_data_path = args.in_fq1_calib if not args.umi_reverse else args.in_fq2_calib
-
-    logger.info(f"Started reading initial data from {pyumi_data_path}")
-    umi_to_count_mapping_pre, id_to_umi, sequences = get_read_to_umi_mapping(pyumi_data_path)
-
-    logger.info(f"Started reading calib data from {calib_data_path}")
-    umi_to_count_mapping_post = get_consensus_group_size_per_read(calib_data_path, id_to_umi)
-
-    logger.info(f"Created dataset")
-    res = pd.DataFrame({'umi': umi_to_count_mapping_pre.keys(),
-                        'read_num_pre_dedup': umi_to_count_mapping_pre.values()}).merge(
-        pd.DataFrame({'umi': umi_to_count_mapping_post.keys(),
-                      'read_num_post_dedup': umi_to_count_mapping_post.values()}), how='outer'
-    ).fillna(0)
-
-    res['umi'] = res['umi'].apply(lambda x: x.decode('ascii'))
+    res_umi, sequences_umi = parse_UMI_data(args)
+    vidjil_df = parse_vidjil_fasta(args.vidjil_fasta)
+    igblast_df = parse_igblast_tsv(args.igblast_tsv)
+    stats, final_clones = load_final_outputs(args.final_stat, args.final_clones)
 
     logger.info('Started report creation...')
-    create_report(res, sequences, report_file_name=args.report_file)
+    create_report(vidjil_df=vidjil_df, igblast_df=igblast_df,
+                  umi_res=res_umi, umi_sequences=sequences_umi,
+                  final_clones_df=final_clones,
+                  report_file_name=args.report_file)
     logger.info(f'Report saved successfully in {args.report_file}.')
-
+
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
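For a quick smoke test of the new interface, only the three required arguments need to be supplied; the file names below are placeholders, not files added by this patch:

```python
# Hypothetical smoke invocation: --igblast-tsv, --final-stat and --final-clones
# are required; the UMI and vidjil inputs are optional per parse_args() above.
import subprocess

subprocess.run(
    ["python", "bin/reporter/run.py",
     "--igblast-tsv", "igblast_airr.tsv",    # placeholder path
     "--final-stat", "final_stat.json",      # placeholder path
     "--final-clones", "final_clones.tsv",   # placeholder path
     "--report-file", "panel_report.html"],
    check=True,
)
```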
diff --git a/bin/reporter/utils.py b/bin/reporter/utils.py
index 1fe6b1a..cb6b21e 100644
--- a/bin/reporter/utils.py
+++ b/bin/reporter/utils.py
@@ -1,5 +1,11 @@
 import gzip
+import json
+import re
 from collections import defaultdict, Counter
+
+import numpy as np
+import pandas as pd
+
+from logger import set_logger
+
+logger = set_logger(name=__file__)
 
 
 def get_consensus_group_size_per_read(fastq_file: str, id_to_umi: dict[int, str]) -> dict[str, int]:
@@ -10,12 +16,12 @@ def get_consensus_group_size_per_read(fastq_file: str, id_to_umi: dict[int, str]
     while header := f.readline().strip().decode("utf-8"):  # read header string
         if not header:
             break
-        f.readline()
-        f.readline()
-        f.readline()
+        _ = f.readline()
+        _ = f.readline()
+        _ = f.readline()
 
         group_ids = header.split('\t')[1].split(';')[:-1]
-        consensus_group_size = len(group_ids)  # @0 145607;265853;563279;
+        consensus_group_size = len(group_ids)  # header format: @0 145607;265853;563279;
         umi = Counter([id_to_umi[int(x)] for x in group_ids]).most_common(1)[0][0]
         umi_to_group_size_per_read[umi] += consensus_group_size
 
@@ -35,8 +41,8 @@ def get_read_to_umi_mapping(fastq_file: str, umi_len: int = 12) -> dict[str, int
             break
         sequence = f.readline()
         sequences.append(sequence)
-        f.readline()
-        f.readline()
+        _ = f.readline()
+        _ = f.readline()
 
         umi = sequence[:umi_len]
 
@@ -44,4 +50,63 @@ def get_read_to_umi_mapping(fastq_file: str, umi_len: int = 12) -> dict[str, int
         id_to_umi[read_id] = umi
         read_id += 1
 
-    return umi_to_count, id_to_umi, sequences
\ No newline at end of file
+    return umi_to_count, id_to_umi, sequences
+
+
+def parse_vidjil_fasta(path: str) -> pd.DataFrame:
+    if path is None:
+        return None
+    logger.info(f"Started reading vidjil data from {path}")
+    headers = []
+    with gzip.open(path, "rt") as f:
+        for line in f:
+            if line.startswith(">"):
+                header = line[1:].strip()
+                sequence_id = header.split()[0]
+                match = re.search(r"(\w+) SEG_", header)
+                locus_from_seg = match.group(1) if match else None
+                headers.append({
+                    "header": header,
+                    "sequence_id": sequence_id,
+                    "locus": locus_from_seg
+                })
+    return pd.DataFrame(headers)
+
+
+def parse_UMI_data(args):
+    if args.in_fq1_pyumi is None:
+        return None, None
+    pyumi_data_path = args.in_fq1_pyumi if not args.umi_reverse else args.in_fq2_pyumi
+    calib_data_path = args.in_fq1_calib if not args.umi_reverse else args.in_fq2_calib
+
+    logger.info(f"Started reading initial data from {pyumi_data_path}")
+    umi_to_count_mapping_pre, id_to_umi, sequences = get_read_to_umi_mapping(pyumi_data_path)
+
+    logger.info(f"Started reading calib data from {calib_data_path}")
+    umi_to_count_mapping_post = get_consensus_group_size_per_read(calib_data_path, id_to_umi)
+
+    logger.info("Created dataset for UMI processing")
+    res = pd.DataFrame({'umi': umi_to_count_mapping_pre.keys(),
+                        'read_num_pre_dedup': umi_to_count_mapping_pre.values()}).merge(
+        pd.DataFrame({'umi': umi_to_count_mapping_post.keys(),
+                      'read_num_post_dedup': umi_to_count_mapping_post.values()}), how='outer'
+    ).fillna(0)
+    res['umi'] = res['umi'].apply(lambda x: x.decode('ascii'))  # UMIs come out of the FASTQ as bytes
+    return res, sequences
+
+
+def parse_igblast_tsv(path: str) -> pd.DataFrame:
+    logger.info(f"Started reading igblast data from {path}")
+    return pd.read_csv(path, sep="\t", comment="#", dtype=str)
+
+
+def load_final_outputs(stat_path, clone_path):
+    logger.info(f"Started reading final pyigmap data from {stat_path}, {clone_path}")
+
+    with open(stat_path) as f:
+        stats = json.load(f)
+    clone_df = pd.read_csv(clone_path, sep="\t")
+    return stats, clone_df
+
+
+def shannon_entropy(probs, base=np.e):
+    probs = np.array(probs)
+    probs = probs[probs > 0]
+    return -np.sum(probs * np.log(probs)) / np.log(base)
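A quick sanity check of `shannon_entropy` as defined above (assuming the interpreter is started from `bin/reporter` so the flat import resolves): a uniform distribution over n outcomes has entropy log(n) in the chosen base.

```python
import numpy as np
from utils import shannon_entropy  # assumes cwd is bin/reporter

probs = np.full(8, 1 / 8)
assert np.isclose(shannon_entropy(probs), np.log(8))    # natural log by default
assert np.isclose(shannon_entropy(probs, base=2), 3.0)  # exactly 3 bits
# Zero-probability entries are filtered out, so padded vectors are safe:
assert np.isclose(shannon_entropy([0.5, 0.5, 0.0], base=2), 1.0)
```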
diff --git a/bin/reporter/viz/__init__.py b/bin/reporter/viz/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bin/reporter/viz/chain_flow.py b/bin/reporter/viz/chain_flow.py
new file mode 100644
index 0000000..5f2cea1
--- /dev/null
+++ b/bin/reporter/viz/chain_flow.py
@@ -0,0 +1,50 @@
+import pandas as pd
+import panel as pn
+import plotly.graph_objects as go
+from bokeh.plotting import figure
+
+from .plot_style import style_plotly_figure
+
+
+def plot_locus_barplot(df: pd.DataFrame, locus_column: str, title: str, color: str = "steelblue"):
+    counts = df[locus_column].value_counts().sort_values(ascending=False)
+    categories = list(counts.index)
+    p = figure(x_range=categories, height=450, width=600, title=title, toolbar_location=None, tools="")
+    p.vbar(x=counts.index, top=counts.values, width=0.8, color=color)
+    p.xgrid.grid_line_color = None
+    p.y_range.start = 0
+    p.xaxis.major_label_orientation = "vertical"
+    return p, counts.max()
+
+
+def plot_locus_distributions(vidjil_df: pd.DataFrame, igblast_df: pd.DataFrame):
+    p1, max1 = plot_locus_barplot(vidjil_df, "locus", "Vidjil: chain distribution", color="steelblue")
+    p2, max2 = plot_locus_barplot(igblast_df, "locus", "IgBlast: chain distribution", color="darkorange")
+    # Share one y-scale so the two panels are directly comparable.
+    y_max = max(max1, max2) * 1.05
+    p1.y_range.end = y_max
+    p2.y_range.end = y_max
+    return pn.Row(p1, p2)
+
+
+def build_chain_sankey(vidjil_df, igblast_df):
+    merged = pd.DataFrame({
+        "sequence_id": vidjil_df["sequence_id"],
+        "vidjil_locus": vidjil_df["locus"]
+    }).merge(
+        igblast_df[["sequence_id", "locus"]], on="sequence_id", how="inner"
+    ).rename(columns={"locus": "igblast_locus"})
+
+    counts = merged.groupby(["vidjil_locus", "igblast_locus"]).size().reset_index(name="value")
+    counts["source"] = "Vidjil: " + counts["vidjil_locus"]
+    counts["target"] = "IgBlast: " + counts["igblast_locus"]
+
+    all_labels = list(pd.unique(counts[["source", "target"]].values.ravel()))
+    label_map = {label: i for i, label in enumerate(all_labels)}
+
+    fig = go.Figure(data=[go.Sankey(
+        node=dict(label=all_labels),
+        link=dict(
+            source=counts["source"].map(label_map),
+            target=counts["target"].map(label_map),
+            value=counts["value"]
+        )
+    )])
+
+    fig.update_layout(width=1200, height=500)
+    return style_plotly_figure(fig)
\ No newline at end of file
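A toy input makes the behaviour of `build_chain_sankey` easy to see; the column names match what `parse_vidjil_fasta` and `parse_igblast_tsv` emit (same `bin/reporter` working-directory assumption as above):

```python
import pandas as pd
from viz.chain_flow import build_chain_sankey

vidjil_df = pd.DataFrame({"sequence_id": ["r1", "r2", "r3"],
                          "locus": ["IGH", "IGH", "TRB"]})
igblast_df = pd.DataFrame({"sequence_id": ["r1", "r2", "r3"],
                           "locus": ["IGH", "IGH", "TRA"]})
# Produces two links: "Vidjil: IGH" -> "IgBlast: IGH" with value 2,
# and "Vidjil: TRB" -> "IgBlast: TRA" with value 1.
fig = build_chain_sankey(vidjil_df, igblast_df)
```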
distribution", color="steelblue") + p2, max2 = plot_locus_barplot(igblast_df, "locus", "IgBlast: chain distribution", color="darkorange") + y_max = max(max1, max2) * 1.05 + p1.y_range.end = y_max + p2.y_range.end = y_max + return pn.Row(p1, p2) + +def build_chain_sankey(vidjil_df, igblast_df): + merged = pd.DataFrame({ + "sequence_id": vidjil_df["sequence_id"], + "vidjil_locus": vidjil_df["locus"] + }).merge( + igblast_df[["sequence_id", "locus"]], on="sequence_id", how="inner" + ).rename(columns={"locus": "igblast_locus"}) + + counts = merged.groupby(["vidjil_locus", "igblast_locus"]).size().reset_index(name="value") + counts["source"] = "Vidjil: " + counts["vidjil_locus"] + counts["target"] = "IgBlast: " + counts["igblast_locus"] + + all_labels = list(pd.unique(counts[["source", "target"]].values.ravel())) + label_map = {l: i for i, l in enumerate(all_labels)} + + fig = go.Figure(data=[go.Sankey( + node=dict(label=all_labels), + link=dict( + source=counts["source"].map(label_map), + target=counts["target"].map(label_map), + value=counts["value"] + ) + )]) + + fig.update_layout(width=1200, height=500) + return style_plotly_figure(fig) \ No newline at end of file diff --git a/bin/reporter/viz/final_tab.py b/bin/reporter/viz/final_tab.py new file mode 100644 index 0000000..e7d50cf --- /dev/null +++ b/bin/reporter/viz/final_tab.py @@ -0,0 +1,93 @@ +import pandas as pd +import numpy as np +import panel as pn +import plotly.graph_objects as go +import plotly.express as px + +from .plot_style import style_plotly_figure +from bin.reporter.utils import shannon_entropy + + +def plot_final_diversity(clones_df: pd.DataFrame): + clone_sizes = clones_df['duplicate_count'] + freqs = clone_sizes / clone_sizes.sum() + + def hill_diversity(freqs, q): + if q == 1: + return np.exp(shannon_entropy(freqs, base=np.e)) + return np.power((freqs ** q).sum(), 1 / (1 - q)) + + hill_qs = np.linspace(0, 5, 100) + hill_values = [hill_diversity(freqs, q) for q in hill_qs] + + fig = go.Figure() + fig.add_trace(go.Scatter(x=hill_qs, y=hill_values, mode='lines', line=dict(color='teal'), name='Hill')) + fig.update_layout(title="Hill Diversity Curve", xaxis_title="q", yaxis_title="Hill Diversity", + width=600, height=350, showlegend=False) + return style_plotly_figure(fig) + + +def plot_final_distributions(clones_df: pd.DataFrame): + clones_df = clones_df.copy() + clones_df['clone_size'] = clones_df['duplicate_count'] + clones_df['cdr3_len'] = clones_df['cdr3_aa'].str.len() + clones_df['log_pgen'] = np.log10(clones_df['pgen'].replace(0, np.nan)).fillna(-100) + + p1 = px.histogram(clones_df, x='log_pgen', nbins=20, title='log10(Pgen)', color_discrete_sequence=['slateblue']) + p1.update_layout(width=400, height=350, xaxis_title='log10(pgen)', yaxis_title='Count') + + p2 = px.histogram(clones_df, x='cdr3_len', nbins=15, title='CDR3 Length', color_discrete_sequence=['darkgreen']) + p2.update_layout(width=400, height=350, xaxis_title='Length', yaxis_title='Count') + + p3 = px.histogram(clones_df, x='clone_size', nbins=10, title='Clone Sizes', color_discrete_sequence=['darkorange']) + p3.update_layout(width=400, height=350, xaxis_title='Clone size', yaxis_title='Frequency') + + return pn.Row( + pn.pane.Plotly(style_plotly_figure(p1)), + pn.pane.Plotly(style_plotly_figure(p2)), + pn.pane.Plotly(style_plotly_figure(p3)) + ) + + +def create_final_tab(clones_df): + dist = plot_final_distributions(clones_df) + hill = plot_final_diversity(clones_df) + return pn.Column("## Final Clone Summary", dist, hill) + + +def 
+
+
+def create_final_per_chain_panel(df: pd.DataFrame):
+    tabs = []
+    for locus in sorted(df["locus"].dropna().unique()):
+        subset = df[df["locus"] == locus]
+
+        top_n = 15
+        row_genes = pn.Row(
+            pn.pane.Plotly(plot_top_genes(subset, "v_call", "Top V genes", top_n), width=400),
+            pn.pane.Plotly(plot_top_genes(subset, "d_call", "Top D genes", top_n), width=400)
+            if "d_call" in subset.columns and subset["d_call"].notna().any() else pn.Spacer(width=0),
+            pn.pane.Plotly(plot_top_genes(subset, "j_call", "Top J genes", top_n), width=400)
+        )
+
+        top_clones = subset.nlargest(10, "duplicate_count")
+        fig_clones = go.Figure(go.Bar(
+            y=top_clones['cdr3_aa'],
+            x=top_clones['duplicate_count'],
+            orientation='h',
+            marker_color='indianred'))
+        fig_clones.update_layout(title="Top 10 Clones by Count", xaxis_title="Count",
+                                 width=600, height=350, showlegend=False)
+
+        diversity = plot_final_diversity(subset)
+        second_summary = plot_final_distributions(subset)
+
+        row_summary = pn.Row(pn.pane.Plotly(style_plotly_figure(fig_clones), width=500),
+                             pn.pane.Plotly(diversity, width=500))
+
+        tab = pn.Column(pn.pane.Markdown(f"### Chain: {locus}"), row_genes, second_summary, row_summary, width=1300)
+        tabs.append((locus, tab))
+    return pn.Tabs(*tabs)
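One behaviour of `plot_final_distributions` worth keeping in mind when reading the log10(Pgen) panel: clones with pgen = 0 cannot be log-transformed, so they are mapped to a -100 sentinel and pile up far to the left of the real values:

```python
import numpy as np
import pandas as pd

pgen = pd.Series([1e-9, 1e-15, 0.0])
log_pgen = np.log10(pgen.replace(0, np.nan)).fillna(-100)
print(log_pgen.tolist())  # [-9.0, -15.0, -100.0]
```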
diff --git a/bin/reporter/viz/igblast_tab.py b/bin/reporter/viz/igblast_tab.py
new file mode 100644
index 0000000..3a0cb4c
--- /dev/null
+++ b/bin/reporter/viz/igblast_tab.py
@@ -0,0 +1,71 @@
+import pandas as pd
+import panel as pn
+import plotly.express as px
+import plotly.graph_objects as go
+
+from .plot_style import style_plotly_figure
+
+
+def plot_top_genes(df, gene_column, title, top_n=15):
+    # Comma-separated multi-gene assignments are resolved to the first (best) hit.
+    counts = df[gene_column].dropna().apply(lambda x: x.split(',')[0]).value_counts().nlargest(top_n)
+    fig = go.Figure(go.Bar(y=counts.index, x=counts.values, marker_color="indianred", orientation='h'))
+    fig.update_layout(title=title, yaxis_title=gene_column, xaxis_title="Count", height=650, showlegend=False)
+    return style_plotly_figure(fig)
+
+
+def plot_cdr3_length(df):
+    if "junction_aa" in df.columns:
+        lengths = df["junction_aa"].dropna().apply(len)
+        title = "CDR3 Length (Amino Acids)"
+    elif "junction_length" in df.columns:
+        lengths = df["junction_length"].dropna().astype(int)
+        title = "CDR3 Length (Nucleotides)"
+    else:
+        return style_plotly_figure(go.Figure())
+    fig = px.histogram(lengths, nbins=30, title=title)
+    fig.update_layout(height=350, showlegend=False)
+    return style_plotly_figure(fig)
+
+
+def plot_productivity(df):
+    counts = df["productive"].value_counts()
+    fig = px.pie(values=counts.values, names=counts.index, title="Productivity Distribution", hole=0.3)
+    fig.update_traces(textinfo='label+percent', textfont_size=12, marker=dict(line=dict(color='#000000', width=1)))
+    fig.update_layout(height=350, showlegend=False)
+    return style_plotly_figure(fig)
+
+
+def plot_v_identity(df):
+    if "v_identity" not in df.columns:
+        return style_plotly_figure(go.Figure())
+    values = df["v_identity"].dropna().astype(float)
+    fig = px.histogram(values, nbins=40, title="V-gene Alignment Identity (%)")
+    fig.update_layout(height=350, showlegend=False)
+    return style_plotly_figure(fig)
+
+
+def create_igblast_summary_panel(df: pd.DataFrame):
+    tabs = []
+    for locus in sorted(df["locus"].dropna().unique()):
+        subset = df[df["locus"] == locus]
+        row1 = pn.Row(
+            pn.pane.Plotly(plot_top_genes(subset, "v_call", "Top V genes"), width=400),
+            pn.pane.Plotly(plot_top_genes(subset, "d_call", "Top D genes"), width=400)
+            if "d_call" in subset.columns and subset["d_call"].notna().any() else pn.Spacer(width=0),
+            pn.pane.Plotly(plot_top_genes(subset, "j_call", "Top J genes"), width=400)
+        )
+
+        row2 = pn.Row(
+            pn.pane.Plotly(plot_cdr3_length(subset), width=400),
+            pn.pane.Plotly(plot_productivity(subset), width=400),
+            pn.pane.Plotly(plot_v_identity(subset), width=400)
+        )
+
+        tab = pn.Column(pn.pane.Markdown(f"### Chain: {locus}"), row1, row2, width=1300)
+        tabs.append((locus, tab))
+    return pn.Tabs(*tabs)
+
+
+def add_igblast_tab_to_report(tabs, igblast_df):
+    tabs.append(("IgBlast Summary", create_igblast_summary_panel(igblast_df)))
+    return tabs
diff --git a/bin/reporter/viz/plot_style.py b/bin/reporter/viz/plot_style.py
new file mode 100644
index 0000000..c202a09
--- /dev/null
+++ b/bin/reporter/viz/plot_style.py
@@ -0,0 +1,13 @@
+import plotly.graph_objects as go
+
+
+def style_plotly_figure(fig: go.Figure, line_color='black', line_width=0.5) -> go.Figure:
+    """Give any Plotly figure a transparent background and thin marker outlines."""
+    fig.update_layout(
+        paper_bgcolor='rgba(0,0,0,0)',
+    )
+    for trace in fig.data:
+        # Trace types without a marker (e.g. Sankey) are skipped by the hasattr check.
+        if hasattr(trace, "marker"):
+            trace.marker.line = dict(color=line_color, width=line_width)
+    return fig
diff --git a/bin/reporter/viz/report.py b/bin/reporter/viz/report.py
new file mode 100644
index 0000000..5c083aa
--- /dev/null
+++ b/bin/reporter/viz/report.py
@@ -0,0 +1,41 @@
+import panel as pn
+
+from .umi_tab import create_umi_tab
+from .igblast_tab import create_igblast_summary_panel
+from .final_tab import create_final_tab, create_final_per_chain_panel
+from .chain_flow import plot_locus_distributions, build_chain_sankey
+
+pn.extension('plotly')  # Plotly panes only render when the extension is loaded
+
+
+def create_report(final_clones_df, igblast_df,
+                  umi_res=None, umi_sequences=None,
+                  vidjil_df=None,
+                  report_file_name='panel_report.html'):
+    tab_stats = pn.Column(
+        pn.pane.Markdown("## Vidjil + IgBlast Report"),
+        pn.pane.Markdown("## Chain distributions"),
+        plot_locus_distributions(vidjil_df, igblast_df),
+        pn.pane.Markdown("## Chain distribution flow: Vidjil → IgBlast"),
+        pn.pane.Plotly(build_chain_sankey(vidjil_df, igblast_df))
+    ) if vidjil_df is not None else pn.Column(
+        "Vidjil tool was not launched in this run.")
+
+    tab_igblast = create_igblast_summary_panel(igblast_df)
+
+    tab_umi = create_umi_tab(umi_res, umi_sequences) if umi_res is not None else pn.Column(
+        "UMI information was not provided.")
+
+    tab_final = create_final_tab(final_clones_df)
+    tab_final_per_chain = create_final_per_chain_panel(final_clones_df)
+
+    tabs = pn.Tabs(
+        ("UMI Info", tab_umi),
+        ("Vidjil + IgBlast", tab_stats),
+        ("IgBlast Stats", tab_igblast),
+        ("Final Clones", tab_final),
+        ("Final Per-Chain", tab_final_per_chain)
+    )
+    report = pn.template.FastListTemplate(
+        title="Pyigmap launch report",
+        main=[tabs]
+    )
+    report.save(report_file_name)
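As an end-to-end check of `create_report`, a small toy clone table covering the columns the tabs touch (`locus`, `v_call`/`d_call`/`j_call`, `cdr3_aa`, `duplicate_count`, `pgen`, plus `productive` and `junction_aa` for the IgBlast tab) is enough. Reusing it as the IgBlast table keeps the sketch short; the real pipeline passes `parse_igblast_tsv` output there. Assumes the working directory is `bin/reporter` and the report dependencies (panel, plotly, bokeh) are installed:

```python
import pandas as pd
from viz.report import create_report

clones = pd.DataFrame({
    "locus": ["IGH", "IGH", "IGK"],
    "v_call": ["IGHV1-2*02", "IGHV3-23*01", "IGKV1-5*03"],
    "d_call": ["IGHD3-10*01", "IGHD2-2*01", None],
    "j_call": ["IGHJ4*02", "IGHJ6*02", "IGKJ1*01"],
    "cdr3_aa": ["CARDYW", "CAKGGSYW", "CQQYNSYPF"],
    "junction_aa": ["CARDYWGQG", "CAKGGSYWGQG", "CQQYNSYPFTF"],
    "productive": ["T", "T", "F"],
    "duplicate_count": [10, 3, 5],
    "pgen": [1e-9, 1e-12, 0.0],
})
# Toy data only: vidjil_df and umi_res stay None, so those tabs show placeholders.
create_report(final_clones_df=clones, igblast_df=clones,
              report_file_name="smoke_report.html")
```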
diff --git a/bin/reporter/viz.py b/bin/reporter/viz/umi_tab.py
similarity index 80%
rename from bin/reporter/viz.py
rename to bin/reporter/viz/umi_tab.py
index f8e4ee5..9afb8c4 100644
--- a/bin/reporter/viz.py
+++ b/bin/reporter/viz/umi_tab.py
@@ -1,18 +1,17 @@
-import panel as pn
-import matplotlib.pyplot as plt
-import seaborn as sns
-import logomaker
+import numpy as np
 import pandas as pd
+import panel as pn
 import plotly.graph_objects as go
 import plotly.express as px
-import numpy as np
 from collections import Counter
 from bokeh.plotting import figure
 from bokeh.models import HoverTool, NumeralTickFormatter
 
+from .plot_style import style_plotly_figure
+
 
 def compute_logo_matrix(sequences):
     seq_len = len(sequences[0])
     counts = {i: Counter() for i in range(seq_len)}
     for seq in sequences:
         for i, char in enumerate(seq):
@@ -54,7 +53,7 @@ def plotly_logo(sequences):
         showlegend=False,
         height=400
     )
-    return fig
+    return style_plotly_figure(fig)
 
 
 @@ -70,23 +69,43 @@ def bokeh_histogram(res, plot_by='read_num_pre_dedup', bins=20, x_max=None, y_range=None):
         width=600,
         tools="pan,wheel_zoom,box_zoom,reset,save"
     )
-    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
-           fill_color="skyblue", line_color="black")
-
+    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="skyblue", line_color="black")
     p.x_range.end = x_max
     p.yaxis.formatter = NumeralTickFormatter(format="0")
     p.add_tools(HoverTool(tooltips=[("Bin", "@left - @right"), ("Count", "@top")], mode="mouse"))
     return p
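Both histogram helpers rely on the same log-spaced binning, which keeps the long right tail of UMI read counts readable. The edges are constructed exactly as in the call above:

```python
import numpy as np

values = np.array([1, 1, 2, 5, 40, 300])
# 20 edges from 10^0 to max+1, geometrically spaced (same call as in the plot code).
edges = np.logspace(np.log10(1), np.log10(values.max() + 1), 20)
hist, _ = np.histogram(values, bins=edges)
assert hist.sum() == len(values)  # every value lands in some bin
```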
+
+
+def bokeh_weighted_usage(res, col='read_num_pre_dedup', bins=20, x_max=None, y_range=None):
+    df = res.copy()
+    total_reads = df[col].sum()
+    df['weight'] = df[col] / total_reads
+    values = df[col].values
+    weights = df['weight'].values
+    hist, edges = np.histogram(values, bins=np.logspace(np.log10(1), np.log10(values.max() + 1), bins), weights=weights)
+    p = figure(
+        title=f"Weighted UMI Usage ({col})",
+        x_axis_type="log",
+        x_axis_label='Read Count',
+        y_axis_label='Weighted UMI Frequency',
+        y_range=y_range,
+        height=500,
+        width=600,
+        tools="pan,wheel_zoom,box_zoom,reset,save"
+    )
+    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="skyblue", line_color="black")
+    p.x_range.end = x_max
+    p.yaxis.formatter = NumeralTickFormatter(format="0.00")
+    p.add_tools(HoverTool(tooltips=[("Bin", "@left - @right"), ("Weighted Freq", "@top")], mode="mouse"))
+    return p
 
 
 def plot_umi_scatter(res):
     res = res.copy()
     res = res[res['read_num_pre_dedup'] > 0]
     res['delta'] = res['read_num_post_dedup'] / res['read_num_pre_dedup']
     top10 = res.sort_values('delta', ascending=False).head(int(len(res) * 0.1))
-
     max_val = max(top10['read_num_pre_dedup'].max(), top10['read_num_post_dedup'].max()) * 1.01
-
     fig = px.scatter(
         top10,
         x="read_num_pre_dedup",
@@ -100,7 +119,6 @@
             "read_num_post_dedup": "Post-dedup Read Count"
         }
     )
-
     fig.add_trace(go.Scatter(
         x=[0, max_val],
         y=[0, max_val],
         mode='lines',
         line=dict(dash='dash', color='gray'),
         showlegend=False
     ))
-
-    fig.update_layout(
-        height=600,
-        width=600,
-        # xaxis=dict(range=[np.log10(1), np.log10(max_val)], type="log"),
-        # yaxis=dict(range=[np.log10(1), np.log10(max_val)], type="log")
-    )
+    fig.update_layout(height=600, width=600)
     fig.update_traces(marker=dict(size=6, color='darkorange'))
-    return fig
-
-
-def bokeh_weighted_usage(res, col='read_num_pre_dedup', bins=20, x_max=None, y_range=None):
-    df = res.copy()
-    total_reads = df[col].sum()
-    df['weight'] = df[col] / total_reads
-    values = df[col].values
-    weights = df['weight'].values
-    hist, edges = np.histogram(values, bins=np.logspace(np.log10(1), np.log10(values.max() + 1), bins), weights=weights)
-    p = figure(
-        title=f"Weighted UMI Usage ({col})",
-        x_axis_type="log",
-        x_axis_label='Read Count',
-        y_axis_label='Weighted UMI Frequency',
-        #
-        y_range=y_range,
-        height=500,
-        width=600,
-        tools="pan,wheel_zoom,box_zoom,reset,save"
-    )
-    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="skyblue", line_color="black")
-
-    # p.x_range.start = x_min
-    p.x_range.end = x_max
-    p.yaxis.formatter = NumeralTickFormatter(format="0.00")
-    p.add_tools(HoverTool(tooltips=[("Bin", "@left - @right"), ("Weighted Freq", "@top")], mode="mouse"))
-    return p
-
+    return style_plotly_figure(fig)
 
 
 def plot_barcode_rank(res):
     df = res.copy()
     df = df[df['read_num_post_dedup'] > 0]
     sorted_reads = df['read_num_post_dedup'].sort_values(ascending=False).reset_index(drop=True)
     cumulative_reads = sorted_reads.cumsum()
-
     if len(sorted_reads) > 2000:
         log_space = np.geomspace(1, len(sorted_reads), num=2000).astype(int) - 1
-        log_space = np.unique(np.clip(log_space, 0, len(sorted_reads)-1))
+        log_space = np.unique(np.clip(log_space, 0, len(sorted_reads) - 1))
         sorted_reads = sorted_reads.iloc[log_space].reset_index(drop=True)
         cumulative_reads = cumulative_reads.iloc[log_space].reset_index(drop=True)
-
     fig = go.Figure()
-
     fig.add_trace(go.Scatter(
         x=cumulative_reads,
         y=sorted_reads,
         mode='lines',
         line=dict(color='teal'),
         name='read_counts'
     ))
-
     fig.update_layout(
         title='UMI Rank Plot (Cumulative Reads vs UMI Read Count)',
         xaxis_title='Cumulative Reads',
         yaxis_title='Reads per UMI',
         xaxis_type='log',
         yaxis_type='log',
         height=500,
         width=600
     )
-
-    return fig
+    return style_plotly_figure(fig)
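The weighting in `bokeh_weighted_usage` makes each UMI contribute its share of the total reads rather than a flat count of one, so the histogram shows where the reads concentrate rather than where the UMIs do:

```python
import pandas as pd

res = pd.DataFrame({"read_num_pre_dedup": [1, 1, 1, 7]})
weights = res["read_num_pre_dedup"] / res["read_num_pre_dedup"].sum()
print(weights.tolist())  # [0.1, 0.1, 0.1, 0.7]; one UMI carries most of the reads
```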
 
 
-def create_report(res, umi_sequences, report_file_name='panel_report.html'):
+def create_umi_tab(res, umi_sequences):
     x_max = max(res['read_num_pre_dedup'].max(), res['read_num_post_dedup'].max())
     y_max_hist = max(
-        np.histogram(res['read_num_pre_dedup'],
-                     bins=np.logspace(np.log10(1), np.log10(res['read_num_pre_dedup'].max() + 1), 20))[0].max(),
-        np.histogram(res['read_num_post_dedup'],
-                     bins=np.logspace(np.log10(1), np.log10(res['read_num_post_dedup'].max() + 1), 20))[0].max()
+        np.histogram(res['read_num_pre_dedup'], bins=np.logspace(np.log10(1), np.log10(res['read_num_pre_dedup'].max() + 1), 20))[0].max(),
+        np.histogram(res['read_num_post_dedup'], bins=np.logspace(np.log10(1), np.log10(res['read_num_post_dedup'].max() + 1), 20))[0].max()
     )
     y_max_weight = max(
         res['read_num_pre_dedup'].value_counts(normalize=True).max(),
         res['read_num_post_dedup'].value_counts(normalize=True).max()
     )
     y_range_hist = (0, y_max_hist * 1.05)
     y_range_weight = (0, y_max_weight * 1.05)
 
     tab_logo = pn.Column(
         pn.pane.Markdown("## UMI Logo"),
         pn.pane.Plotly(plotly_logo(umi_sequences), config={'responsive': True})
     )
     tab_umi_usage = pn.Column(
         pn.pane.Markdown("## UMI Usage Pre- and Post-Deduplication"),
         pn.Row(
-            pn.pane.Bokeh(
-                bokeh_histogram(res, plot_by='read_num_pre_dedup', x_max=x_max, y_range=y_range_hist)),
-            pn.pane.Bokeh(
-                bokeh_histogram(res, plot_by='read_num_post_dedup', x_max=x_max, y_range=y_range_hist))
+            pn.pane.Bokeh(bokeh_histogram(res, plot_by='read_num_pre_dedup', x_max=x_max, y_range=y_range_hist)),
+            pn.pane.Bokeh(bokeh_histogram(res, plot_by='read_num_post_dedup', x_max=x_max, y_range=y_range_hist))
         )
     )
     tab_scatter = pn.Column(
         pn.pane.Markdown("## Pre- vs Post-Deduplication UMI Changes"),
         pn.pane.Plotly(plot_umi_scatter(res), config={'responsive': True})
     )
     tab_weighted = pn.Column(
         pn.pane.Markdown("## Weighted UMI Usage by Read Count"),
         pn.Row(
-            pn.pane.Bokeh(
-                bokeh_weighted_usage(res, col='read_num_pre_dedup', x_max=x_max, y_range=y_range_weight)),
-            pn.pane.Bokeh(
-                bokeh_weighted_usage(res, col='read_num_post_dedup', x_max=x_max, y_range=y_range_weight))
+            pn.pane.Bokeh(bokeh_weighted_usage(res, col='read_num_pre_dedup', x_max=x_max, y_range=y_range_weight)),
+            pn.pane.Bokeh(bokeh_weighted_usage(res, col='read_num_post_dedup', x_max=x_max, y_range=y_range_weight))
         )
     )
     tab_rank = pn.Column(
         pn.pane.Markdown("## UMI Barcode Rank Plot"),
         pn.pane.Plotly(plot_barcode_rank(res), config={'responsive': True})
     )
-
-    tabs = pn.Tabs(
+    return pn.Tabs(
         ("Sequence Logo", tab_logo),
         ("UMI Usage", tab_umi_usage),
         ("Weighted UMI Usage", tab_weighted),
         ("UMI Usage Changes", tab_scatter),
-        ("Read Rank Plot", tab_rank)
-    )
-    report = pn.template.FastListTemplate(
-        title="UMI Analysis Report",
-        main=[tabs]
-    )
-    report.save(report_file_name)
+        ("Read Rank Plot", tab_rank))
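The refactored `create_umi_tab` can also be driven standalone with a synthetic deduplication table, which is handy when iterating on the UMI plots. Column names follow `parse_UMI_data`; the random UMIs and counts are purely illustrative:

```python
import numpy as np
import pandas as pd
from viz.umi_tab import create_umi_tab  # assumes cwd is bin/reporter

rng = np.random.default_rng(0)
n = 200
res = pd.DataFrame({
    "umi": ["".join(rng.choice(list("ACGT"), size=12)) for _ in range(n)],
    "read_num_pre_dedup": rng.integers(1, 200, size=n),
    "read_num_post_dedup": rng.integers(1, 5, size=n),
})
tabs = create_umi_tab(res, res["umi"].tolist())
tabs.save("umi_tabs_preview.html")  # any Panel object can be saved for a quick look
```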