From 49a88ed16a08266db9bfb83d26be83fed2484ee4 Mon Sep 17 00:00:00 2001
From: Elizaveta Vlasova
Date: Fri, 16 May 2025 22:25:57 +0300
Subject: [PATCH] Add full reports for the pyigmap pipeline

---
 bin/reporter/run.py                     |  70 ++++++++------
 bin/reporter/utils.py                   |  79 ++++++++++++++--
 bin/reporter/viz/__init__.py            |   0
 bin/reporter/viz/chain_flow.py          |  50 ++++++++++
 bin/reporter/viz/final_tab.py           |  93 ++++++++++++++++++
 bin/reporter/viz/igblast_tab.py         |  71 ++++++++++++++
 bin/reporter/viz/plot_style.py          |  13 +++
 bin/reporter/viz/report.py              |  41 ++++++++
 bin/reporter/{viz.py => viz/umi_tab.py} | 121 +++++++++---------------
 9 files changed, 426 insertions(+), 112 deletions(-)
 create mode 100644 bin/reporter/viz/__init__.py
 create mode 100644 bin/reporter/viz/chain_flow.py
 create mode 100644 bin/reporter/viz/final_tab.py
 create mode 100644 bin/reporter/viz/igblast_tab.py
 create mode 100644 bin/reporter/viz/plot_style.py
 create mode 100644 bin/reporter/viz/report.py
 rename bin/reporter/{viz.py => viz/umi_tab.py} (80%)

diff --git a/bin/reporter/run.py b/bin/reporter/run.py
index cfd9e85..a7c4cd5 100644
--- a/bin/reporter/run.py
+++ b/bin/reporter/run.py
@@ -3,23 +3,47 @@
 import pandas as pd
 
 from logger import set_logger
-
-from utils import get_read_to_umi_mapping, get_consensus_group_size_per_read
-from viz import create_report
+from utils import (
+    parse_UMI_data,
+    parse_vidjil_fasta,
+    parse_igblast_tsv,
+    load_final_outputs
+)
+from viz.report import create_report
 
 logger = set_logger(name=__file__)
 
+
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser()
-    parser.add_argument('--in-fq1-pyumi', help='Input fastq.gz file after pyumi step, SE or PE pair 1',
+    parser.add_argument('--in-fq1-pyumi',
+                        help='Input fastq.gz file after pyumi step, SE or PE pair 1. Needed if UMIs are specified',
+                        required=False)
+    parser.add_argument('--in-fq2-pyumi',
+                        help='Input fastq.gz file after pyumi step, PE pair 2. Needed if UMIs are specified')
+
+    parser.add_argument('--in-fq1-calib',
+                        help='Input fastq.gz file after calib step, SE or PE pair 1. Needed if UMIs are specified',
+                        required=False)
+    parser.add_argument('--in-fq2-calib',
+                        help='Input fastq.gz file after calib step, PE pair 2. Needed if UMIs are specified')
+
+    parser.add_argument('--umi-reverse', action='store_true')
+
+    parser.add_argument('--vidjil-fasta',
+                        help='Input file with vidjil algo results. Needed if this step was used')
+
+    parser.add_argument('--igblast-tsv',
+                        help='File with IgBlast results',
                         required=True)
-    parser.add_argument('--in-fq2-pyumi', help='Input fastq.gz file after pyumi step, PE pair 2')
-    parser.add_argument('--in-fq1-calib', help='Input fastq.gz file after calib step, SE or PE pair 2',
+    parser.add_argument('--final-stat',
+                        help='File with final results (json stat file)',
                         required=True)
-    parser.add_argument('--in-fq2-calib', help='Input fastq.gz file after calib step, PE pair 2')
-    parser.add_argument('--umi-reverse', action='store_true')
+    parser.add_argument('--final-clones',
+                        help='File with final clonotype results (clone data file)',
+                        required=True)
 
     parser.add_argument('--report-file')
 
@@ -30,28 +54,18 @@ def main() -> None:
     args = parse_args()
     logger.info(f"Starting program with the following arguments: {args}")
 
-    pyumi_data_path = args.in_fq1_pyumi if not args.umi_reverse else args.in_fq2_pyumi
-    calib_data_path = args.in_fq1_calib if not args.umi_reverse else args.in_fq2_calib
-
-    logger.info(f"Started reading initial data from {pyumi_data_path}")
-    umi_to_count_mapping_pre, id_to_umi, sequences = get_read_to_umi_mapping(pyumi_data_path)
-
-    logger.info(f"Started reading calib data from {calib_data_path}")
-    umi_to_count_mapping_post = get_consensus_group_size_per_read(calib_data_path, id_to_umi)
-
-    logger.info(f"Created dataset")
-    res = pd.DataFrame({'umi': umi_to_count_mapping_pre.keys(),
-                        'read_num_pre_dedup': umi_to_count_mapping_pre.values()}).merge(
-        pd.DataFrame({'umi': umi_to_count_mapping_post.keys(),
-                      'read_num_post_dedup': umi_to_count_mapping_post.values()}), how='outer'
-    ).fillna(0)
-
-    res['umi'] = res['umi'].apply(lambda x: x.decode('ascii'))
+    res_umi, sequences_umi = parse_UMI_data(args)
+    vidjil_df = parse_vidjil_fasta(args.vidjil_fasta)
+    igblast_df = parse_igblast_tsv(args.igblast_tsv)
+    stats, final_clones = load_final_outputs(args.final_stat, args.final_clones)
 
     logger.info('Started report creation...')
-    create_report(res, sequences, report_file_name=args.report_file)
+    create_report(vidjil_df=vidjil_df, igblast_df=igblast_df,
+                  umi_res=res_umi, umi_sequences=sequences_umi,
+                  final_clones_df=final_clones,
+                  report_file_name=args.report_file)
     logger.info(f'Report saved successfully in {args.report_file}.')
-
+
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
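For a quick smoke test of the new interface, only the three required arguments need to be supplied; the file names below are placeholders, not files added by this patch:

```python
# Hypothetical smoke invocation: --igblast-tsv, --final-stat and --final-clones
# are required; the UMI and vidjil inputs are optional per parse_args() above.
import subprocess

subprocess.run(
    ["python", "bin/reporter/run.py",
     "--igblast-tsv", "igblast_airr.tsv",    # placeholder path
     "--final-stat", "final_stat.json",      # placeholder path
     "--final-clones", "final_clones.tsv",   # placeholder path
     "--report-file", "panel_report.html"],
    check=True,
)
```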
diff --git a/bin/reporter/utils.py b/bin/reporter/utils.py
index 1fe6b1a..cb6b21e 100644
--- a/bin/reporter/utils.py
+++ b/bin/reporter/utils.py
@@ -1,5 +1,11 @@
 import gzip
+import json
+import re
 from collections import defaultdict, Counter
+
+import numpy as np
+import pandas as pd
+
+from logger import set_logger
+
+logger = set_logger(name=__file__)
 
 
 def get_consensus_group_size_per_read(fastq_file: str, id_to_umi: dict[int, str]) -> dict[str, int]:
@@ -10,12 +16,12 @@ def get_consensus_group_size_per_read(fastq_file: str, id_to_umi: dict[int, str]
     while header := f.readline().strip().decode("utf-8"):  # read header string
         if not header:
             break
-        f.readline()
-        f.readline()
-        f.readline()
+        _ = f.readline()
+        _ = f.readline()
+        _ = f.readline()
 
         group_ids = header.split('\t')[1].split(';')[:-1]
-        consensus_group_size = len(group_ids)  # @0 145607;265853;563279;
+        consensus_group_size = len(group_ids)  # header format: @0 145607;265853;563279;
         umi = Counter([id_to_umi[int(x)] for x in group_ids]).most_common(1)[0][0]
         umi_to_group_size_per_read[umi] += consensus_group_size
 
@@ -35,8 +41,8 @@ def get_read_to_umi_mapping(fastq_file: str, umi_len: int = 12) -> dict[str, int
             break
         sequence = f.readline()
         sequences.append(sequence)
-        f.readline()
-        f.readline()
+        _ = f.readline()
+        _ = f.readline()
 
         umi = sequence[:umi_len]
 
@@ -44,4 +50,63 @@ def get_read_to_umi_mapping(fastq_file: str, umi_len: int = 12) -> dict[str, int
         id_to_umi[read_id] = umi
         read_id += 1
 
-    return umi_to_count, id_to_umi, sequences
\ No newline at end of file
+    return umi_to_count, id_to_umi, sequences
+
+
+def parse_vidjil_fasta(path: str) -> pd.DataFrame:
+    if path is None:
+        return None
+    logger.info(f"Started reading vidjil data from {path}")
+    headers = []
+    with gzip.open(path, "rt") as f:
+        for line in f:
+            if line.startswith(">"):
+                header = line[1:].strip()
+                sequence_id = header.split()[0]
+                match = re.search(r"(\w+) SEG_", header)
+                locus_from_seg = match.group(1) if match else None
+                headers.append({
+                    "header": header,
+                    "sequence_id": sequence_id,
+                    "locus": locus_from_seg
+                })
+    return pd.DataFrame(headers)
+
+
+def parse_UMI_data(args):
+    if args.in_fq1_pyumi is None:
+        return None, None
+    pyumi_data_path = args.in_fq1_pyumi if not args.umi_reverse else args.in_fq2_pyumi
+    calib_data_path = args.in_fq1_calib if not args.umi_reverse else args.in_fq2_calib
+
+    logger.info(f"Started reading initial data from {pyumi_data_path}")
+    umi_to_count_mapping_pre, id_to_umi, sequences = get_read_to_umi_mapping(pyumi_data_path)
+
+    logger.info(f"Started reading calib data from {calib_data_path}")
+    umi_to_count_mapping_post = get_consensus_group_size_per_read(calib_data_path, id_to_umi)
+
+    logger.info("Created dataset for UMI processing")
+    res = pd.DataFrame({'umi': umi_to_count_mapping_pre.keys(),
+                        'read_num_pre_dedup': umi_to_count_mapping_pre.values()}).merge(
+        pd.DataFrame({'umi': umi_to_count_mapping_post.keys(),
+                      'read_num_post_dedup': umi_to_count_mapping_post.values()}), how='outer'
+    ).fillna(0)
+    res['umi'] = res['umi'].apply(lambda x: x.decode('ascii'))  # UMIs come out of the FASTQ as bytes
+    return res, sequences
+
+
+def parse_igblast_tsv(path: str) -> pd.DataFrame:
+    logger.info(f"Started reading igblast data from {path}")
+    return pd.read_csv(path, sep="\t", comment="#", dtype=str)
+
+
+def load_final_outputs(stat_path, clone_path):
+    logger.info(f"Started reading final pyigmap data from {stat_path}, {clone_path}")
+
+    with open(stat_path) as f:
+        stats = json.load(f)
+    clone_df = pd.read_csv(clone_path, sep="\t")
+    return stats, clone_df
+
+
+def shannon_entropy(probs, base=np.e):
+    probs = np.array(probs)
+    probs = probs[probs > 0]
+    return -np.sum(probs * np.log(probs)) / np.log(base)
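A quick sanity check of `shannon_entropy` as defined above (assuming the interpreter is started from `bin/reporter` so the flat import resolves): a uniform distribution over n outcomes has entropy log(n) in the chosen base.

```python
import numpy as np
from utils import shannon_entropy  # assumes cwd is bin/reporter

probs = np.full(8, 1 / 8)
assert np.isclose(shannon_entropy(probs), np.log(8))    # natural log by default
assert np.isclose(shannon_entropy(probs, base=2), 3.0)  # exactly 3 bits
# Zero-probability entries are filtered out, so padded vectors are safe:
assert np.isclose(shannon_entropy([0.5, 0.5, 0.0], base=2), 1.0)
```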
diff --git a/bin/reporter/viz/__init__.py b/bin/reporter/viz/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/bin/reporter/viz/chain_flow.py b/bin/reporter/viz/chain_flow.py
new file mode 100644
index 0000000..5f2cea1
--- /dev/null
+++ b/bin/reporter/viz/chain_flow.py
@@ -0,0 +1,50 @@
+import pandas as pd
+import panel as pn
+import plotly.graph_objects as go
+from bokeh.plotting import figure
+
+from .plot_style import style_plotly_figure
+
+
+def plot_locus_barplot(df: pd.DataFrame, locus_column: str, title: str, color: str = "steelblue"):
+    counts = df[locus_column].value_counts().sort_values(ascending=False)
+    categories = list(counts.index)
+    p = figure(x_range=categories, height=450, width=600, title=title, toolbar_location=None, tools="")
+    p.vbar(x=counts.index, top=counts.values, width=0.8, color=color)
+    p.xgrid.grid_line_color = None
+    p.y_range.start = 0
+    p.xaxis.major_label_orientation = "vertical"
+    return p, counts.max()
+
+
+def plot_locus_distributions(vidjil_df: pd.DataFrame, igblast_df: pd.DataFrame):
+    p1, max1 = plot_locus_barplot(vidjil_df, "locus", "Vidjil: chain distribution", color="steelblue")
+    p2, max2 = plot_locus_barplot(igblast_df, "locus", "IgBlast: chain distribution", color="darkorange")
+    # Share one y-scale so the two panels are directly comparable.
+    y_max = max(max1, max2) * 1.05
+    p1.y_range.end = y_max
+    p2.y_range.end = y_max
+    return pn.Row(p1, p2)
+
+
+def build_chain_sankey(vidjil_df, igblast_df):
+    merged = pd.DataFrame({
+        "sequence_id": vidjil_df["sequence_id"],
+        "vidjil_locus": vidjil_df["locus"]
+    }).merge(
+        igblast_df[["sequence_id", "locus"]], on="sequence_id", how="inner"
+    ).rename(columns={"locus": "igblast_locus"})
+
+    counts = merged.groupby(["vidjil_locus", "igblast_locus"]).size().reset_index(name="value")
+    counts["source"] = "Vidjil: " + counts["vidjil_locus"]
+    counts["target"] = "IgBlast: " + counts["igblast_locus"]
+
+    all_labels = list(pd.unique(counts[["source", "target"]].values.ravel()))
+    label_map = {label: i for i, label in enumerate(all_labels)}
+
+    fig = go.Figure(data=[go.Sankey(
+        node=dict(label=all_labels),
+        link=dict(
+            source=counts["source"].map(label_map),
+            target=counts["target"].map(label_map),
+            value=counts["value"]
+        )
+    )])
+
+    fig.update_layout(width=1200, height=500)
+    return style_plotly_figure(fig)
\ No newline at end of file
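A toy input makes the behaviour of `build_chain_sankey` easy to see; the column names match what `parse_vidjil_fasta` and `parse_igblast_tsv` emit (same `bin/reporter` working-directory assumption as above):

```python
import pandas as pd
from viz.chain_flow import build_chain_sankey

vidjil_df = pd.DataFrame({"sequence_id": ["r1", "r2", "r3"],
                          "locus": ["IGH", "IGH", "TRB"]})
igblast_df = pd.DataFrame({"sequence_id": ["r1", "r2", "r3"],
                           "locus": ["IGH", "IGH", "TRA"]})
# Produces two links: "Vidjil: IGH" -> "IgBlast: IGH" with value 2,
# and "Vidjil: TRB" -> "IgBlast: TRA" with value 1.
fig = build_chain_sankey(vidjil_df, igblast_df)
```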
distribution", color="steelblue") + p2, max2 = plot_locus_barplot(igblast_df, "locus", "IgBlast: chain distribution", color="darkorange") + y_max = max(max1, max2) * 1.05 + p1.y_range.end = y_max + p2.y_range.end = y_max + return pn.Row(p1, p2) + +def build_chain_sankey(vidjil_df, igblast_df): + merged = pd.DataFrame({ + "sequence_id": vidjil_df["sequence_id"], + "vidjil_locus": vidjil_df["locus"] + }).merge( + igblast_df[["sequence_id", "locus"]], on="sequence_id", how="inner" + ).rename(columns={"locus": "igblast_locus"}) + + counts = merged.groupby(["vidjil_locus", "igblast_locus"]).size().reset_index(name="value") + counts["source"] = "Vidjil: " + counts["vidjil_locus"] + counts["target"] = "IgBlast: " + counts["igblast_locus"] + + all_labels = list(pd.unique(counts[["source", "target"]].values.ravel())) + label_map = {l: i for i, l in enumerate(all_labels)} + + fig = go.Figure(data=[go.Sankey( + node=dict(label=all_labels), + link=dict( + source=counts["source"].map(label_map), + target=counts["target"].map(label_map), + value=counts["value"] + ) + )]) + + fig.update_layout(width=1200, height=500) + return style_plotly_figure(fig) \ No newline at end of file diff --git a/bin/reporter/viz/final_tab.py b/bin/reporter/viz/final_tab.py new file mode 100644 index 0000000..e7d50cf --- /dev/null +++ b/bin/reporter/viz/final_tab.py @@ -0,0 +1,93 @@ +import pandas as pd +import numpy as np +import panel as pn +import plotly.graph_objects as go +import plotly.express as px + +from .plot_style import style_plotly_figure +from bin.reporter.utils import shannon_entropy + + +def plot_final_diversity(clones_df: pd.DataFrame): + clone_sizes = clones_df['duplicate_count'] + freqs = clone_sizes / clone_sizes.sum() + + def hill_diversity(freqs, q): + if q == 1: + return np.exp(shannon_entropy(freqs, base=np.e)) + return np.power((freqs ** q).sum(), 1 / (1 - q)) + + hill_qs = np.linspace(0, 5, 100) + hill_values = [hill_diversity(freqs, q) for q in hill_qs] + + fig = go.Figure() + fig.add_trace(go.Scatter(x=hill_qs, y=hill_values, mode='lines', line=dict(color='teal'), name='Hill')) + fig.update_layout(title="Hill Diversity Curve", xaxis_title="q", yaxis_title="Hill Diversity", + width=600, height=350, showlegend=False) + return style_plotly_figure(fig) + + +def plot_final_distributions(clones_df: pd.DataFrame): + clones_df = clones_df.copy() + clones_df['clone_size'] = clones_df['duplicate_count'] + clones_df['cdr3_len'] = clones_df['cdr3_aa'].str.len() + clones_df['log_pgen'] = np.log10(clones_df['pgen'].replace(0, np.nan)).fillna(-100) + + p1 = px.histogram(clones_df, x='log_pgen', nbins=20, title='log10(Pgen)', color_discrete_sequence=['slateblue']) + p1.update_layout(width=400, height=350, xaxis_title='log10(pgen)', yaxis_title='Count') + + p2 = px.histogram(clones_df, x='cdr3_len', nbins=15, title='CDR3 Length', color_discrete_sequence=['darkgreen']) + p2.update_layout(width=400, height=350, xaxis_title='Length', yaxis_title='Count') + + p3 = px.histogram(clones_df, x='clone_size', nbins=10, title='Clone Sizes', color_discrete_sequence=['darkorange']) + p3.update_layout(width=400, height=350, xaxis_title='Clone size', yaxis_title='Frequency') + + return pn.Row( + pn.pane.Plotly(style_plotly_figure(p1)), + pn.pane.Plotly(style_plotly_figure(p2)), + pn.pane.Plotly(style_plotly_figure(p3)) + ) + + +def create_final_tab(clones_df): + dist = plot_final_distributions(clones_df) + hill = plot_final_diversity(clones_df) + return pn.Column("## Final Clone Summary", dist, hill) + + +def 
+
+
+def create_final_per_chain_panel(df: pd.DataFrame):
+    tabs = []
+    for locus in sorted(df["locus"].dropna().unique()):
+        subset = df[df["locus"] == locus]
+
+        top_n = 15
+        row_genes = pn.Row(
+            pn.pane.Plotly(plot_top_genes(subset, "v_call", "Top V genes", top_n), width=400),
+            pn.pane.Plotly(plot_top_genes(subset, "d_call", "Top D genes", top_n), width=400)
+            if "d_call" in subset.columns and subset["d_call"].notna().any() else pn.Spacer(width=0),
+            pn.pane.Plotly(plot_top_genes(subset, "j_call", "Top J genes", top_n), width=400)
+        )
+
+        top_clones = subset.nlargest(10, "duplicate_count")
+        fig_clones = go.Figure(go.Bar(
+            y=top_clones['cdr3_aa'],
+            x=top_clones['duplicate_count'],
+            orientation='h',
+            marker_color='indianred'))
+        fig_clones.update_layout(title="Top 10 Clones by Count", xaxis_title="Count",
+                                 width=600, height=350, showlegend=False)
+
+        diversity = plot_final_diversity(subset)
+        second_summary = plot_final_distributions(subset)
+
+        row_summary = pn.Row(pn.pane.Plotly(style_plotly_figure(fig_clones), width=500),
+                             pn.pane.Plotly(diversity, width=500))
+
+        tab = pn.Column(pn.pane.Markdown(f"### Chain: {locus}"), row_genes, second_summary, row_summary, width=1300)
+        tabs.append((locus, tab))
+    return pn.Tabs(*tabs)
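One behaviour of `plot_final_distributions` worth keeping in mind when reading the log10(Pgen) panel: clones with pgen = 0 cannot be log-transformed, so they are mapped to a -100 sentinel and pile up far to the left of the real values:

```python
import numpy as np
import pandas as pd

pgen = pd.Series([1e-9, 1e-15, 0.0])
log_pgen = np.log10(pgen.replace(0, np.nan)).fillna(-100)
print(log_pgen.tolist())  # [-9.0, -15.0, -100.0]
```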
diff --git a/bin/reporter/viz/igblast_tab.py b/bin/reporter/viz/igblast_tab.py
new file mode 100644
index 0000000..3a0cb4c
--- /dev/null
+++ b/bin/reporter/viz/igblast_tab.py
@@ -0,0 +1,71 @@
+import pandas as pd
+import panel as pn
+import plotly.express as px
+import plotly.graph_objects as go
+
+from .plot_style import style_plotly_figure
+
+
+def plot_top_genes(df, gene_column, title, top_n=15):
+    # Comma-separated multi-gene assignments are resolved to the first (best) hit.
+    counts = df[gene_column].dropna().apply(lambda x: x.split(',')[0]).value_counts().nlargest(top_n)
+    fig = go.Figure(go.Bar(y=counts.index, x=counts.values, marker_color="indianred", orientation='h'))
+    fig.update_layout(title=title, yaxis_title=gene_column, xaxis_title="Count", height=650, showlegend=False)
+    return style_plotly_figure(fig)
+
+
+def plot_cdr3_length(df):
+    if "junction_aa" in df.columns:
+        lengths = df["junction_aa"].dropna().apply(len)
+        title = "CDR3 Length (Amino Acids)"
+    elif "junction_length" in df.columns:
+        lengths = df["junction_length"].dropna().astype(int)
+        title = "CDR3 Length (Nucleotides)"
+    else:
+        return style_plotly_figure(go.Figure())
+    fig = px.histogram(lengths, nbins=30, title=title)
+    fig.update_layout(height=350, showlegend=False)
+    return style_plotly_figure(fig)
+
+
+def plot_productivity(df):
+    counts = df["productive"].value_counts()
+    fig = px.pie(values=counts.values, names=counts.index, title="Productivity Distribution", hole=0.3)
+    fig.update_traces(textinfo='label+percent', textfont_size=12, marker=dict(line=dict(color='#000000', width=1)))
+    fig.update_layout(height=350, showlegend=False)
+    return style_plotly_figure(fig)
+
+
+def plot_v_identity(df):
+    if "v_identity" not in df.columns:
+        return style_plotly_figure(go.Figure())
+    values = df["v_identity"].dropna().astype(float)
+    fig = px.histogram(values, nbins=40, title="V-gene Alignment Identity (%)")
+    fig.update_layout(height=350, showlegend=False)
+    return style_plotly_figure(fig)
+
+
+def create_igblast_summary_panel(df: pd.DataFrame):
+    tabs = []
+    for locus in sorted(df["locus"].dropna().unique()):
+        subset = df[df["locus"] == locus]
+        row1 = pn.Row(
+            pn.pane.Plotly(plot_top_genes(subset, "v_call", "Top V genes"), width=400),
+            pn.pane.Plotly(plot_top_genes(subset, "d_call", "Top D genes"), width=400)
+            if "d_call" in subset.columns and subset["d_call"].notna().any() else pn.Spacer(width=0),
+            pn.pane.Plotly(plot_top_genes(subset, "j_call", "Top J genes"), width=400)
+        )
+
+        row2 = pn.Row(
+            pn.pane.Plotly(plot_cdr3_length(subset), width=400),
+            pn.pane.Plotly(plot_productivity(subset), width=400),
+            pn.pane.Plotly(plot_v_identity(subset), width=400)
+        )
+
+        tab = pn.Column(pn.pane.Markdown(f"### Chain: {locus}"), row1, row2, width=1300)
+        tabs.append((locus, tab))
+    return pn.Tabs(*tabs)
+
+
+def add_igblast_tab_to_report(tabs, igblast_df):
+    tabs.append(("IgBlast Summary", create_igblast_summary_panel(igblast_df)))
+    return tabs
diff --git a/bin/reporter/viz/plot_style.py b/bin/reporter/viz/plot_style.py
new file mode 100644
index 0000000..c202a09
--- /dev/null
+++ b/bin/reporter/viz/plot_style.py
@@ -0,0 +1,13 @@
+import plotly.graph_objects as go
+
+
+def style_plotly_figure(fig: go.Figure, line_color='black', line_width=0.5) -> go.Figure:
+    """Give any Plotly figure a transparent background and thin marker outlines."""
+    fig.update_layout(
+        paper_bgcolor='rgba(0,0,0,0)',
+    )
+    for trace in fig.data:
+        # Trace types without a marker (e.g. Sankey) are skipped by the hasattr check.
+        if hasattr(trace, "marker"):
+            trace.marker.line = dict(color=line_color, width=line_width)
+    return fig
diff --git a/bin/reporter/viz/report.py b/bin/reporter/viz/report.py
new file mode 100644
index 0000000..5c083aa
--- /dev/null
+++ b/bin/reporter/viz/report.py
@@ -0,0 +1,41 @@
+import panel as pn
+
+from .umi_tab import create_umi_tab
+from .igblast_tab import create_igblast_summary_panel
+from .final_tab import create_final_tab, create_final_per_chain_panel
+from .chain_flow import plot_locus_distributions, build_chain_sankey
+
+pn.extension('plotly')  # Plotly panes only render when the extension is loaded
+
+
+def create_report(final_clones_df, igblast_df,
+                  umi_res=None, umi_sequences=None,
+                  vidjil_df=None,
+                  report_file_name='panel_report.html'):
+    tab_stats = pn.Column(
+        pn.pane.Markdown("## Vidjil + IgBlast Report"),
+        pn.pane.Markdown("## Chain distributions"),
+        plot_locus_distributions(vidjil_df, igblast_df),
+        pn.pane.Markdown("## Chain distribution flow: Vidjil → IgBlast"),
+        pn.pane.Plotly(build_chain_sankey(vidjil_df, igblast_df))
+    ) if vidjil_df is not None else pn.Column(
+        "Vidjil tool was not launched in this run.")
+
+    tab_igblast = create_igblast_summary_panel(igblast_df)
+
+    tab_umi = create_umi_tab(umi_res, umi_sequences) if umi_res is not None else pn.Column(
+        "UMI information was not provided.")
+
+    tab_final = create_final_tab(final_clones_df)
+    tab_final_per_chain = create_final_per_chain_panel(final_clones_df)
+
+    tabs = pn.Tabs(
+        ("UMI Info", tab_umi),
+        ("Vidjil + IgBlast", tab_stats),
+        ("IgBlast Stats", tab_igblast),
+        ("Final Clones", tab_final),
+        ("Final Per-Chain", tab_final_per_chain)
+    )
+    report = pn.template.FastListTemplate(
+        title="Pyigmap launch report",
+        main=[tabs]
+    )
+    report.save(report_file_name)
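As an end-to-end check of `create_report`, a small toy clone table covering the columns the tabs touch (`locus`, `v_call`/`d_call`/`j_call`, `cdr3_aa`, `duplicate_count`, `pgen`, plus `productive` and `junction_aa` for the IgBlast tab) is enough. Reusing it as the IgBlast table keeps the sketch short; the real pipeline passes `parse_igblast_tsv` output there. Assumes the working directory is `bin/reporter` and the report dependencies (panel, plotly, bokeh) are installed:

```python
import pandas as pd
from viz.report import create_report

clones = pd.DataFrame({
    "locus": ["IGH", "IGH", "IGK"],
    "v_call": ["IGHV1-2*02", "IGHV3-23*01", "IGKV1-5*03"],
    "d_call": ["IGHD3-10*01", "IGHD2-2*01", None],
    "j_call": ["IGHJ4*02", "IGHJ6*02", "IGKJ1*01"],
    "cdr3_aa": ["CARDYW", "CAKGGSYW", "CQQYNSYPF"],
    "junction_aa": ["CARDYWGQG", "CAKGGSYWGQG", "CQQYNSYPFTF"],
    "productive": ["T", "T", "F"],
    "duplicate_count": [10, 3, 5],
    "pgen": [1e-9, 1e-12, 0.0],
})
# Toy data only: vidjil_df and umi_res stay None, so those tabs show placeholders.
create_report(final_clones_df=clones, igblast_df=clones,
              report_file_name="smoke_report.html")
```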
diff --git a/bin/reporter/viz.py b/bin/reporter/viz/umi_tab.py
similarity index 80%
rename from bin/reporter/viz.py
rename to bin/reporter/viz/umi_tab.py
index f8e4ee5..9afb8c4 100644
--- a/bin/reporter/viz.py
+++ b/bin/reporter/viz/umi_tab.py
@@ -1,18 +1,17 @@
-import panel as pn
-import matplotlib.pyplot as plt
-import seaborn as sns
-import logomaker
+import numpy as np
 import pandas as pd
+import panel as pn
 import plotly.graph_objects as go
 import plotly.express as px
-import numpy as np
 from collections import Counter
 from bokeh.plotting import figure
 from bokeh.models import HoverTool, NumeralTickFormatter
 
+from .plot_style import style_plotly_figure
+
 
 def compute_logo_matrix(sequences):
     seq_len = len(sequences[0])
     counts = {i: Counter() for i in range(seq_len)}
     for seq in sequences:
         for i, char in enumerate(seq):
@@ -54,7 +53,7 @@ def plotly_logo(sequences):
         showlegend=False,
         height=400
     )
-    return fig
+    return style_plotly_figure(fig)
 
 
 @@ -70,23 +69,43 @@ def bokeh_histogram(res, plot_by='read_num_pre_dedup', bins=20, x_max=None, y_range=None):
         width=600,
         tools="pan,wheel_zoom,box_zoom,reset,save"
     )
-    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
-           fill_color="skyblue", line_color="black")
-
+    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="skyblue", line_color="black")
     p.x_range.end = x_max
     p.yaxis.formatter = NumeralTickFormatter(format="0")
     p.add_tools(HoverTool(tooltips=[("Bin", "@left - @right"), ("Count", "@top")], mode="mouse"))
     return p
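Both histogram helpers rely on the same log-spaced binning, which keeps the long right tail of UMI read counts readable. The edges are constructed exactly as in the call above:

```python
import numpy as np

values = np.array([1, 1, 2, 5, 40, 300])
# 20 edges from 10^0 to max+1, geometrically spaced (same call as in the plot code).
edges = np.logspace(np.log10(1), np.log10(values.max() + 1), 20)
hist, _ = np.histogram(values, bins=edges)
assert hist.sum() == len(values)  # every value lands in some bin
```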
+
+
+def bokeh_weighted_usage(res, col='read_num_pre_dedup', bins=20, x_max=None, y_range=None):
+    df = res.copy()
+    total_reads = df[col].sum()
+    df['weight'] = df[col] / total_reads
+    values = df[col].values
+    weights = df['weight'].values
+    hist, edges = np.histogram(values, bins=np.logspace(np.log10(1), np.log10(values.max() + 1), bins), weights=weights)
+    p = figure(
+        title=f"Weighted UMI Usage ({col})",
+        x_axis_type="log",
+        x_axis_label='Read Count',
+        y_axis_label='Weighted UMI Frequency',
+        y_range=y_range,
+        height=500,
+        width=600,
+        tools="pan,wheel_zoom,box_zoom,reset,save"
+    )
+    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="skyblue", line_color="black")
+    p.x_range.end = x_max
+    p.yaxis.formatter = NumeralTickFormatter(format="0.00")
+    p.add_tools(HoverTool(tooltips=[("Bin", "@left - @right"), ("Weighted Freq", "@top")], mode="mouse"))
+    return p
 
 
 def plot_umi_scatter(res):
     res = res.copy()
     res = res[res['read_num_pre_dedup'] > 0]
     res['delta'] = res['read_num_post_dedup'] / res['read_num_pre_dedup']
     top10 = res.sort_values('delta', ascending=False).head(int(len(res) * 0.1))
-
     max_val = max(top10['read_num_pre_dedup'].max(), top10['read_num_post_dedup'].max()) * 1.01
-
     fig = px.scatter(
         top10,
         x="read_num_pre_dedup",
@@ -100,7 +119,6 @@
             "read_num_post_dedup": "Post-dedup Read Count"
         }
     )
-
     fig.add_trace(go.Scatter(
         x=[0, max_val],
         y=[0, max_val],
         mode='lines',
         line=dict(dash='dash', color='gray'),
         showlegend=False
     ))
-
-    fig.update_layout(
-        height=600,
-        width=600,
-        # xaxis=dict(range=[np.log10(1), np.log10(max_val)], type="log"),
-        # yaxis=dict(range=[np.log10(1), np.log10(max_val)], type="log")
-    )
+    fig.update_layout(height=600, width=600)
     fig.update_traces(marker=dict(size=6, color='darkorange'))
-    return fig
-
-
-def bokeh_weighted_usage(res, col='read_num_pre_dedup', bins=20, x_max=None, y_range=None):
-    df = res.copy()
-    total_reads = df[col].sum()
-    df['weight'] = df[col] / total_reads
-    values = df[col].values
-    weights = df['weight'].values
-    hist, edges = np.histogram(values, bins=np.logspace(np.log10(1), np.log10(values.max() + 1), bins), weights=weights)
-    p = figure(
-        title=f"Weighted UMI Usage ({col})",
-        x_axis_type="log",
-        x_axis_label='Read Count',
-        y_axis_label='Weighted UMI Frequency',
-        #
-        y_range=y_range,
-        height=500,
-        width=600,
-        tools="pan,wheel_zoom,box_zoom,reset,save"
-    )
-    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], fill_color="skyblue", line_color="black")
-
-    # p.x_range.start = x_min
-    p.x_range.end = x_max
-    p.yaxis.formatter = NumeralTickFormatter(format="0.00")
-    p.add_tools(HoverTool(tooltips=[("Bin", "@left - @right"), ("Weighted Freq", "@top")], mode="mouse"))
-    return p
-
+    return style_plotly_figure(fig)
 
 
 def plot_barcode_rank(res):
     df = res.copy()
     df = df[df['read_num_post_dedup'] > 0]
     sorted_reads = df['read_num_post_dedup'].sort_values(ascending=False).reset_index(drop=True)
     cumulative_reads = sorted_reads.cumsum()
-
     if len(sorted_reads) > 2000:
         log_space = np.geomspace(1, len(sorted_reads), num=2000).astype(int) - 1
-        log_space = np.unique(np.clip(log_space, 0, len(sorted_reads)-1))
+        log_space = np.unique(np.clip(log_space, 0, len(sorted_reads) - 1))
         sorted_reads = sorted_reads.iloc[log_space].reset_index(drop=True)
         cumulative_reads = cumulative_reads.iloc[log_space].reset_index(drop=True)
-
     fig = go.Figure()
-
     fig.add_trace(go.Scatter(
         x=cumulative_reads,
         y=sorted_reads,
         mode='lines',
         line=dict(color='teal'),
         name='read_counts'
     ))
-
     fig.update_layout(
         title='UMI Rank Plot (Cumulative Reads vs UMI Read Count)',
         xaxis_title='Cumulative Reads',
         yaxis_title='Reads per UMI',
         xaxis_type='log',
         yaxis_type='log',
         height=500,
         width=600
     )
-
-    return fig
+    return style_plotly_figure(fig)
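The weighting in `bokeh_weighted_usage` makes each UMI contribute its share of the total reads rather than a flat count of one, so the histogram shows where the reads concentrate rather than where the UMIs do:

```python
import pandas as pd

res = pd.DataFrame({"read_num_pre_dedup": [1, 1, 1, 7]})
weights = res["read_num_pre_dedup"] / res["read_num_pre_dedup"].sum()
print(weights.tolist())  # [0.1, 0.1, 0.1, 0.7]; one UMI carries most of the reads
```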
 
 
-def create_report(res, umi_sequences, report_file_name='panel_report.html'):
+def create_umi_tab(res, umi_sequences):
     x_max = max(res['read_num_pre_dedup'].max(), res['read_num_post_dedup'].max())
     y_max_hist = max(
-        np.histogram(res['read_num_pre_dedup'],
-                     bins=np.logspace(np.log10(1), np.log10(res['read_num_pre_dedup'].max() + 1), 20))[0].max(),
-        np.histogram(res['read_num_post_dedup'],
-                     bins=np.logspace(np.log10(1), np.log10(res['read_num_post_dedup'].max() + 1), 20))[0].max()
+        np.histogram(res['read_num_pre_dedup'], bins=np.logspace(np.log10(1), np.log10(res['read_num_pre_dedup'].max() + 1), 20))[0].max(),
+        np.histogram(res['read_num_post_dedup'], bins=np.logspace(np.log10(1), np.log10(res['read_num_post_dedup'].max() + 1), 20))[0].max()
     )
     y_max_weight = max(
         res['read_num_pre_dedup'].value_counts(normalize=True).max(),
         res['read_num_post_dedup'].value_counts(normalize=True).max()
     )
     y_range_hist = (0, y_max_hist * 1.05)
     y_range_weight = (0, y_max_weight * 1.05)
 
     tab_logo = pn.Column(
         pn.pane.Markdown("## UMI Logo"),
         pn.pane.Plotly(plotly_logo(umi_sequences), config={'responsive': True})
     )
     tab_umi_usage = pn.Column(
         pn.pane.Markdown("## UMI Usage Pre- and Post-Deduplication"),
         pn.Row(
-            pn.pane.Bokeh(
-                bokeh_histogram(res, plot_by='read_num_pre_dedup', x_max=x_max, y_range=y_range_hist)),
-            pn.pane.Bokeh(
-                bokeh_histogram(res, plot_by='read_num_post_dedup', x_max=x_max, y_range=y_range_hist))
+            pn.pane.Bokeh(bokeh_histogram(res, plot_by='read_num_pre_dedup', x_max=x_max, y_range=y_range_hist)),
+            pn.pane.Bokeh(bokeh_histogram(res, plot_by='read_num_post_dedup', x_max=x_max, y_range=y_range_hist))
         )
     )
     tab_scatter = pn.Column(
         pn.pane.Markdown("## Pre- vs Post-Deduplication UMI Changes"),
         pn.pane.Plotly(plot_umi_scatter(res), config={'responsive': True})
     )
     tab_weighted = pn.Column(
         pn.pane.Markdown("## Weighted UMI Usage by Read Count"),
         pn.Row(
-            pn.pane.Bokeh(
-                bokeh_weighted_usage(res, col='read_num_pre_dedup', x_max=x_max, y_range=y_range_weight)),
-            pn.pane.Bokeh(
-                bokeh_weighted_usage(res, col='read_num_post_dedup', x_max=x_max, y_range=y_range_weight))
+            pn.pane.Bokeh(bokeh_weighted_usage(res, col='read_num_pre_dedup', x_max=x_max, y_range=y_range_weight)),
+            pn.pane.Bokeh(bokeh_weighted_usage(res, col='read_num_post_dedup', x_max=x_max, y_range=y_range_weight))
         )
     )
     tab_rank = pn.Column(
         pn.pane.Markdown("## UMI Barcode Rank Plot"),
         pn.pane.Plotly(plot_barcode_rank(res), config={'responsive': True})
     )
-
-    tabs = pn.Tabs(
+    return pn.Tabs(
         ("Sequence Logo", tab_logo),
         ("UMI Usage", tab_umi_usage),
         ("Weighted UMI Usage", tab_weighted),
         ("UMI Usage Changes", tab_scatter),
-        ("Read Rank Plot", tab_rank)
-    )
-    report = pn.template.FastListTemplate(
-        title="UMI Analysis Report",
-        main=[tabs]
-    )
-    report.save(report_file_name)
+        ("Read Rank Plot", tab_rank))
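The refactored `create_umi_tab` can also be driven standalone with a synthetic deduplication table, which is handy when iterating on the UMI plots. Column names follow `parse_UMI_data`; the random UMIs and counts are purely illustrative:

```python
import numpy as np
import pandas as pd
from viz.umi_tab import create_umi_tab  # assumes cwd is bin/reporter

rng = np.random.default_rng(0)
n = 200
res = pd.DataFrame({
    "umi": ["".join(rng.choice(list("ACGT"), size=12)) for _ in range(n)],
    "read_num_pre_dedup": rng.integers(1, 200, size=n),
    "read_num_post_dedup": rng.integers(1, 5, size=n),
})
tabs = create_umi_tab(res, res["umi"].tolist())
tabs.save("umi_tabs_preview.html")  # any Panel object can be saved for a quick look
```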