Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 42 additions & 28 deletions bin/reporter/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,47 @@
import pandas as pd
from logger import set_logger


from utils import get_read_to_umi_mapping, get_consensus_group_size_per_read
from viz import create_report
from bin.reporter.utils import (
parse_UMI_data,
parse_vidjil_fasta,
parse_igblast_tsv,
load_final_outputs
)
from bin.reporter.viz.report import create_report

logger = set_logger(name=__file__)


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument('--in-fq1-pyumi', help='Input fastq.gz file after pyumi step, SE or PE pair 1',
parser.add_argument('--in-fq1-pyumi',
help='Input fastq.gz file after pyumi step, SE or PE pair 1. Needed if UMIs are specified',
required=False)
parser.add_argument('--in-fq2-pyumi',
help='Input fastq.gz file after pyumi step, PE pair 2. Needed if UMIs are specified')

parser.add_argument('--in-fq1-calib',
help='Input fastq.gz file after calib step, SE or PE pair 2. Needed if UMIs are specified',
required=False)
parser.add_argument('--in-fq2-calib',
help='Input fastq.gz file after calib step, PE pair 2. Needed if UMIs are specified')

parser.add_argument('--umi-reverse', action='store_true')

parser.add_argument('--vidjil-fasta',
help='Input file with vidjil algo results. Needed if this step was used')

parser.add_argument('--igblast-tsv',
help='File with IgBlast results',
required=True)
parser.add_argument('--in-fq2-pyumi', help='Input fastq.gz file after pyumi step, PE pair 2')

parser.add_argument('--in-fq1-calib', help='Input fastq.gz file after calib step, SE or PE pair 2',
parser.add_argument('--final-stat',
help='File with final results (json stat file)',
required=True)
parser.add_argument('--in-fq2-calib', help='Input fastq.gz file after calib step, PE pair 2')

parser.add_argument('--umi-reverse', action='store_true')
parser.add_argument('--final-clones',
help='File with IgBlast results (clone data file)',
required=True)

parser.add_argument('--report-file')

Expand All @@ -30,28 +54,18 @@ def main() -> None:
args = parse_args()
logger.info(f"Starting program with the following arguments: {args}")

pyumi_data_path = args.in_fq1_pyumi if not args.umi_reverse else args.in_fq2_pyumi
calib_data_path = args.in_fq1_calib if not args.umi_reverse else args.in_fq2_calib

logger.info(f"Started reading initial data from {pyumi_data_path}")
umi_to_count_mapping_pre, id_to_umi, sequences = get_read_to_umi_mapping(pyumi_data_path)

logger.info(f"Started reading calib data from {calib_data_path}")
umi_to_count_mapping_post = get_consensus_group_size_per_read(calib_data_path, id_to_umi)

logger.info(f"Created dataset")
res = pd.DataFrame({'umi': umi_to_count_mapping_pre.keys(),
'read_num_pre_dedup': umi_to_count_mapping_pre.values()}).merge(
pd.DataFrame({'umi': umi_to_count_mapping_post.keys(),
'read_num_post_dedup': umi_to_count_mapping_post.values()}), how='outer'
).fillna(0)

res['umi'] = res['umi'].apply(lambda x: x.decode('ascii'))
res_umi, sequences_umi = parse_UMI_data(args)
vidjil_df = parse_vidjil_fasta(args.vidjil_fasta)
igblast_df = parse_igblast_tsv(args.igblast_tsv)
stats, final_clones = load_final_outputs(args.final_stat, args.final_clones)

logger.info('Started report creation...')
create_report(res, sequences, report_file_name=args.report_file)
create_report(vidjil_df=vidjil_df, igblast_df=igblast_df,
umi_res=res_umi, umi_sequences=sequences_umi,
final_clones_df=final_clones,
report_file_name=args.report_file)
logger.info(f'Report saved successfully in {args.report_file}.')


if __name__ == "__main__":
main()
main()
79 changes: 72 additions & 7 deletions bin/reporter/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
import gzip
from collections import defaultdict, Counter
import pandas as pd
import re
from logger import set_logger
import numpy as np
logger = set_logger(name=__file__)



def get_consensus_group_size_per_read(fastq_file: str, id_to_umi: dict[int, str]) -> dict[str, int]:
Expand All @@ -10,12 +16,12 @@ def get_consensus_group_size_per_read(fastq_file: str, id_to_umi: dict[int, str]
while header := f.readline().strip().decode("utf-8"): # read header string
if not header:
break
f.readline()
f.readline()
f.readline()
_ = f.readline()
_ = f.readline()
_ = f.readline()

group_ids = header.split('\t')[1].split(';')[:-1]
consensus_group_size = len(group_ids) # @0 145607;265853;563279;
consensus_group_size = len(group_ids)

umi = Counter([id_to_umi[int(x)] for x in group_ids]).most_common(1)[0][0]
umi_to_group_size_per_read[umi] += consensus_group_size
Expand All @@ -35,13 +41,72 @@ def get_read_to_umi_mapping(fastq_file: str, umi_len: int = 12) -> dict[str, int
break
sequence = f.readline()
sequences.append(sequence)
f.readline()
f.readline()
_ = f.readline()
_ = f.readline()

umi = sequence[:umi_len]

umi_to_count[umi] += 1
id_to_umi[read_id] = umi
read_id += 1

return umi_to_count, id_to_umi, sequences
return umi_to_count, id_to_umi, sequences

def parse_vidjil_fasta(path: str) -> pd.DataFrame:
    """Parse a gzipped Vidjil FASTA and extract per-read locus annotations.

    Only header lines (``>``) are read. Each header is expected to look like
    ``<sequence_id> ... <LOCUS> SEG_...``; the locus is the word immediately
    preceding ``SEG_`` when that pattern is present, otherwise None.

    :param path: path to a gzip-compressed FASTA produced by the vidjil step,
        or None when that step was skipped.
    :return: DataFrame with columns [header, sequence_id, locus], or None
        when `path` is None. The columns are always present — even for a
        FASTA with no headers — so downstream merges on ``sequence_id``
        cannot fail with a KeyError.
    """
    if path is None:
        return None
    logger.info(f"Started reading vidjil data from {path}")
    seg_pattern = re.compile(r"(\w+) SEG_")  # hoisted: compiled once, not per header line
    records = []
    with gzip.open(path, "rt") as f:
        for line in f:
            if not line.startswith(">"):
                continue
            header = line[1:].strip()
            match = seg_pattern.search(header)
            records.append({
                "header": header,
                "sequence_id": header.split()[0],
                "locus": match.group(1) if match else None,
            })
    # Fix: pd.DataFrame([]) would have *no* columns at all, which breaks
    # downstream access such as vidjil_df["sequence_id"] on an empty FASTA.
    return pd.DataFrame(records, columns=["header", "sequence_id", "locus"])


def parse_UMI_data(args):
    """Build the per-UMI pre-/post-dedup read-count table from the pyumi and calib fastqs.

    :param args: parsed CLI namespace; reads in_fq1_pyumi, in_fq2_pyumi,
        in_fq1_calib, in_fq2_calib and umi_reverse.
    :return: tuple ``(res, sequences)`` where ``res`` is a DataFrame with
        columns [umi, read_num_pre_dedup, read_num_post_dedup] (outer-merged,
        missing counts filled with 0) and ``sequences`` is the list of raw
        reads from the pyumi fastq. Returns ``(None, None)`` when no pyumi
        input was given, i.e. UMIs were not used in this run.
    """
    if args.in_fq1_pyumi is None:
        return None, None
    # The UMI-carrying read is pair 1 by default; --umi-reverse selects pair 2.
    pyumi_data_path = args.in_fq1_pyumi if not args.umi_reverse else args.in_fq2_pyumi
    calib_data_path = args.in_fq1_calib if not args.umi_reverse else args.in_fq2_calib

    logger.info(f"Started reading initial data from {pyumi_data_path}")
    umi_to_count_mapping_pre, id_to_umi, sequences = get_read_to_umi_mapping(pyumi_data_path)

    logger.info(f"Started reading calib data from {calib_data_path}")
    umi_to_count_mapping_post = get_consensus_group_size_per_read(calib_data_path, id_to_umi)

    logger.info(f"Created dataset for UMI processing")
    # Outer merge keeps UMIs that appear only before or only after dedup;
    # their missing count on the other side becomes 0.
    # NOTE(review): the fastq readers operate on gzip binary streams, so the
    # 'umi' keys are presumably bytes; an earlier version of this pipeline
    # applied .decode('ascii') to the 'umi' column before reporting — confirm
    # that downstream consumers handle bytes UMIs.
    res = pd.DataFrame({'umi': umi_to_count_mapping_pre.keys(),
                        'read_num_pre_dedup': umi_to_count_mapping_pre.values()}).merge(
        pd.DataFrame({'umi': umi_to_count_mapping_post.keys(),
                      'read_num_post_dedup': umi_to_count_mapping_post.values()}), how='outer'
    ).fillna(0)
    return res, sequences

def parse_igblast_tsv(path: str) -> pd.DataFrame:
    """Load an IgBlast AIRR-style TSV into a DataFrame.

    Lines starting with ``#`` are skipped and every column is kept as a
    string to avoid lossy type inference.
    """
    logger.info(f"Started reading igblast data from {path}")
    igblast_df = pd.read_csv(path, dtype=str, comment="#", sep="\t")
    return igblast_df


def load_final_outputs(stat_path, clone_path):
    """Read the final pipeline outputs.

    :param stat_path: path to the JSON statistics file.
    :param clone_path: path to the tab-separated clone table.
    :return: tuple ``(stats dict, clones DataFrame)``.
    """
    import json

    logger.info(f"Started reading final pyigmap data from {stat_path}, {clone_path}")
    with open(stat_path) as stat_fh:
        stat_payload = json.load(stat_fh)
    clones = pd.read_csv(clone_path, sep="\t")
    return stat_payload, clones

def shannon_entropy(probs, base=np.e):
    """Shannon entropy of a probability vector, in units of ``log(base)``.

    Zero-probability entries contribute nothing to the entropy and are
    dropped before the logarithm is taken, so they never produce ``-inf``.
    """
    p = np.asarray(probs)
    nonzero = p[p > 0]
    return -np.sum(nonzero * np.log(nonzero)) / np.log(base)
Empty file added bin/reporter/viz/__init__.py
Empty file.
50 changes: 50 additions & 0 deletions bin/reporter/viz/chain_flow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pandas as pd
import panel as pn
import plotly.graph_objects as go
from bokeh.plotting import figure
from .plot_style import style_plotly_figure

def plot_locus_barplot(df: pd.DataFrame, locus_column: str, title: str, color: str = "steelblue"):
    """Bar plot of per-locus read counts, ordered from most to least frequent.

    Returns the bokeh figure together with the tallest bar's height so the
    caller can align y-axes across several plots.
    """
    locus_counts = df[locus_column].value_counts().sort_values(ascending=False)
    fig = figure(x_range=list(locus_counts.index), height=450, width=600,
                 title=title, toolbar_location=None, tools="")
    fig.vbar(x=locus_counts.index, top=locus_counts.values, width=0.8, color=color)
    fig.xgrid.grid_line_color = None
    fig.y_range.start = 0
    fig.xaxis.major_label_orientation = "vertical"
    return fig, locus_counts.max()

def plot_locus_distributions(vidjil_df: pd.DataFrame, igblast_df: pd.DataFrame):
    """Side-by-side chain (locus) distributions for Vidjil vs IgBlast on a shared y-scale."""
    vidjil_plot, vidjil_peak = plot_locus_barplot(
        vidjil_df, "locus", "Vidjil: chain distribution", color="steelblue")
    igblast_plot, igblast_peak = plot_locus_barplot(
        igblast_df, "locus", "IgBlast: chain distribution", color="darkorange")
    # 5% headroom above the tallest bar of either tool.
    shared_top = max(vidjil_peak, igblast_peak) * 1.05
    for plot in (vidjil_plot, igblast_plot):
        plot.y_range.end = shared_top
    return pn.Row(vidjil_plot, igblast_plot)

def build_chain_sankey(vidjil_df, igblast_df):
    """Sankey diagram of how Vidjil locus calls map onto IgBlast locus calls.

    Reads are joined on ``sequence_id`` (inner join), so only reads annotated
    by both tools contribute to the flows.
    """
    vidjil_calls = pd.DataFrame({
        "sequence_id": vidjil_df["sequence_id"],
        "vidjil_locus": vidjil_df["locus"],
    })
    merged = vidjil_calls.merge(
        igblast_df[["sequence_id", "locus"]], on="sequence_id", how="inner"
    ).rename(columns={"locus": "igblast_locus"})

    # One flow per (vidjil locus, igblast locus) pair, weighted by read count.
    flows = merged.groupby(["vidjil_locus", "igblast_locus"]).size().reset_index(name="value")
    flows["source"] = "Vidjil: " + flows["vidjil_locus"]
    flows["target"] = "IgBlast: " + flows["igblast_locus"]

    labels = list(pd.unique(flows[["source", "target"]].values.ravel()))
    index_of = {label: i for i, label in enumerate(labels)}

    sankey = go.Sankey(
        node=dict(label=labels),
        link=dict(
            source=flows["source"].map(index_of),
            target=flows["target"].map(index_of),
            value=flows["value"],
        ),
    )
    fig = go.Figure(data=[sankey])
    fig.update_layout(width=1200, height=500)
    return style_plotly_figure(fig)
93 changes: 93 additions & 0 deletions bin/reporter/viz/final_tab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import pandas as pd
import numpy as np
import panel as pn
import plotly.graph_objects as go
import plotly.express as px

from .plot_style import style_plotly_figure
from bin.reporter.utils import shannon_entropy


def plot_final_diversity(clones_df: pd.DataFrame):
    """Hill diversity curve (q in [0, 5]) for the clone-size distribution.

    Hill numbers generalize richness (q=0), the exponential of Shannon
    entropy (q=1) and inverse Simpson diversity (q=2) into one family.
    """
    sizes = clones_df['duplicate_count']
    freqs = sizes / sizes.sum()

    def hill_number(freqs, q):
        # q == 1 is a removable singularity of the general formula;
        # use the Shannon-entropy limit there.
        if q == 1:
            return np.exp(shannon_entropy(freqs, base=np.e))
        return np.power((freqs ** q).sum(), 1 / (1 - q))

    q_grid = np.linspace(0, 5, 100)
    curve = [hill_number(freqs, q) for q in q_grid]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=q_grid, y=curve, mode='lines',
                             line=dict(color='teal'), name='Hill'))
    fig.update_layout(title="Hill Diversity Curve", xaxis_title="q", yaxis_title="Hill Diversity",
                      width=600, height=350, showlegend=False)
    return style_plotly_figure(fig)


def plot_final_distributions(clones_df: pd.DataFrame):
    """Three histograms summarising the final clones: log10(pgen), CDR3 length, clone size."""
    data = clones_df.copy()  # work on a copy; the caller's frame stays untouched
    data['clone_size'] = data['duplicate_count']
    data['cdr3_len'] = data['cdr3_aa'].str.len()
    # pgen == 0 cannot be log-transformed; park those clones at -100.
    data['log_pgen'] = np.log10(data['pgen'].replace(0, np.nan)).fillna(-100)

    pgen_hist = px.histogram(data, x='log_pgen', nbins=20, title='log10(Pgen)',
                             color_discrete_sequence=['slateblue'])
    pgen_hist.update_layout(width=400, height=350, xaxis_title='log10(pgen)', yaxis_title='Count')

    cdr3_hist = px.histogram(data, x='cdr3_len', nbins=15, title='CDR3 Length',
                             color_discrete_sequence=['darkgreen'])
    cdr3_hist.update_layout(width=400, height=350, xaxis_title='Length', yaxis_title='Count')

    size_hist = px.histogram(data, x='clone_size', nbins=10, title='Clone Sizes',
                             color_discrete_sequence=['darkorange'])
    size_hist.update_layout(width=400, height=350, xaxis_title='Clone size', yaxis_title='Frequency')

    return pn.Row(
        pn.pane.Plotly(style_plotly_figure(pgen_hist)),
        pn.pane.Plotly(style_plotly_figure(cdr3_hist)),
        pn.pane.Plotly(style_plotly_figure(size_hist)),
    )


def create_final_tab(clones_df):
    """Assemble the final-clone report section: summary histograms plus the Hill diversity curve."""
    return pn.Column(
        "## Final Clone Summary",
        plot_final_distributions(clones_df),
        plot_final_diversity(clones_df),
    )


def create_final_per_chain_panel(df: pd.DataFrame):
    """Build one report tab per locus (chain) found in ``df["locus"]``.

    Each tab shows top V/D/J gene usage, the summary histograms, the Hill
    diversity curve and the ten largest clones by ``duplicate_count``.
    Rows with a null locus are excluded.
    """
    tabs = []
    for locus in sorted(df["locus"].dropna().unique()):
        subset = df[df["locus"] == locus]

        top_n = 15
        # The D-gene plot is drawn only when the column exists and has at
        # least one non-null call; otherwise a zero-width spacer keeps the
        # row shape (the conditional expression skips plot_top_genes entirely
        # in that case).
        row_genes = pn.Row(
            pn.pane.Plotly(plot_top_genes(subset, "v_call", "Top V genes", top_n), width=400),
            pn.pane.Plotly(plot_top_genes(subset, "d_call", "Top D genes", top_n),
                           width=400) if "d_call" in subset.columns and subset["d_call"].notna().any() else pn.Spacer(width=0),
            pn.pane.Plotly(plot_top_genes(subset, "j_call", "Top J genes", top_n), width=400)
        )

        # Horizontal bars: largest clones, labelled by their CDR3 amino-acid sequence.
        top_clones = subset.nlargest(10, "duplicate_count")
        fig_clones = go.Figure(go.Bar(
            y=top_clones['cdr3_aa'],
            x=top_clones['duplicate_count'],
            orientation='h',
            marker_color='indianred'))
        fig_clones.update_layout(title="Top 10 Clones by Count", xaxis_title="Count", width=600, height=350, showlegend=False)

        diversity = plot_final_diversity(subset)
        second_summary = plot_final_distributions(subset)

        row_summary = pn.Row(pn.pane.Plotly(style_plotly_figure(fig_clones), width=500), pn.pane.Plotly(diversity, width=500))

        tab = pn.Column(pn.pane.Markdown(f"### Chain: {locus}"), row_genes, second_summary, row_summary, width=1300)
        tabs.append((locus, tab))
    return pn.Tabs(*tabs)


def plot_top_genes(df, gene_column, title, top_n=15):
    """Horizontal bar chart of the `top_n` most frequent genes in `gene_column`.

    Multi-gene calls (comma-separated) are collapsed to their first entry
    before counting; missing calls are ignored.
    """
    primary_calls = df[gene_column].dropna().str.split(',').str[0]
    counts = primary_calls.value_counts().nlargest(top_n)
    fig = go.Figure(go.Bar(y=counts.index, x=counts.values,
                           orientation='h', marker_color="indianred"))
    fig.update_layout(title=title, yaxis_title=gene_column, xaxis_title="Count",
                      height=650, showlegend=False)
    return style_plotly_figure(fig)
Loading