Merged
Changes from all commits (21 commits)
7752853
processing amelogenin from UAS sample details report [skip ci]
rnmitchell May 6, 2025
fb465c4
steps through convert can use amelogenin [skip ci]
rnmitchell May 7, 2025
7a7ad3b
fixed convert step for crappy sequences in amel and filtering amel se…
rnmitchell May 8, 2025
086c7fe
fixed typo in amel filtering function [skip ci]
rnmitchell May 13, 2025
4634d42
amelogenin now plotting correctly in pdf [skip ci]
rnmitchell May 14, 2025
992e608
fixed bug in combining reads when using custom sequence ranges [skip ci]
rnmitchell May 15, 2025
dc557e0
fixed bug with custom sequence ranges in amel [skip ci]
rnmitchell May 16, 2025
4ce279c
began implementing amel into GUI marker plots [skip ci]
rnmitchell May 19, 2025
65b878a
fixed custom range for amel [skip ci]
rnmitchell May 20, 2025
01472ee
handling samples with no sequences passing filters [skip ci]
rnmitchell May 20, 2025
ebc7fc3
fixed plotting amel in gui [skip ci]
rnmitchell May 20, 2025
c2929b1
added blank plots for missing loci [skip ci]
rnmitchell May 21, 2025
694c980
made str lists specific for each kit [skip ci]
rnmitchell May 23, 2025
f089083
added empty plots to GUI for missing markers [skip ci]
rnmitchell May 27, 2025
cd44d52
removed extra marker in powerseq list [skip ci]
rnmitchell May 28, 2025
1fb7846
removed extra marker in powerseq list [skip ci]
rnmitchell May 28, 2025
eec3ac1
began updating tests [skip ci]
rnmitchell May 28, 2025
740c5ea
updated remaining tests
rnmitchell Jun 2, 2025
1f03738
fixed formatting issues; added str lists as json file
rnmitchell Jun 4, 2025
ac814c7
fixed bug
rnmitchell Jun 4, 2025
ae3c813
simplified convert code
rnmitchell Jun 5, 2025
56 changes: 47 additions & 9 deletions lusSTR/cli/gui.py
@@ -22,6 +22,7 @@
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objs as go
import streamlit as st
from streamlit_option_menu import option_menu
import yaml
@@ -43,6 +44,14 @@ def get_filter_metadata_file():
filter_marker_data = json.load(fh)


def get_strlist_file():
return importlib.resources.files("lusSTR") / "data/str_lists.json"


with open(get_strlist_file(), "r") as fh:
str_lists = json.load(fh)


# ------------ Function to Generate config.yaml File ---------- #


@@ -197,14 +206,33 @@ def interactive_plots_allmarkers(sample_df, flagged_df):
max_yvalue = (int(math.ceil(max_reads / n)) * n) + n
increase_value = int(math.ceil((max_yvalue / 5) / n)) * n
n = 0
for marker in sample_df["Locus"].unique():
all_loci = (
str_lists["forenseq_strs"]
if st.session_state.kit == "forenseq"
else str_lists["powerseq_strs"]
)
missing_loci = [x for x in all_loci if x not in sample_df["Locus"].unique()]
for marker in all_loci:
col = cols[n]
container = col.container(border=True)
sample_locus = sample_df["SampleID"].unique() + "_" + marker
marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele")
sample_df["CE_Allele"] = np.where(
sample_df["Locus"] == "AMELOGENIN",
np.where(sample_df["CE_Allele"] == "X", 0, 1),
sample_df["CE_Allele"],
)
sample_df["CE_Allele"] = pd.to_numeric(sample_df["CE_Allele"])
marker_df = sample_df[sample_df["Locus"] == marker].sort_values(
by=["CE_Allele", "allele_type"], ascending=[False, True]
)
if sample_locus in flagged_df["key"].values:
marker = f"⚠️{marker}⚠️"
plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True)
if marker in missing_loci:
marker = f"⚠️{marker}⚠️"
plot = go.Figure()
Review comment from a repository member:
I feel like this plotly import pattern is an elaborate ruse so people can put "go figure" in their code 😆

plot.update_layout(title=marker)
else:
plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True)
container.plotly_chart(plot, use_container_width=True)
if n == 3:
n = 0
@@ -240,9 +268,14 @@ def interactive_plots(df, locus, ymax, increase, all=False):
)
plot.add_hline(y=at, line_width=3, line_dash="dot", line_color="gray")
plot.add_annotation(text=f"AT", x=min_x + 0.1, y=at, showarrow=False, yshift=10)
plot.update_layout(
xaxis=dict(range=[min_x, max_x], tickmode="array", tickvals=np.arange(min_x, max_x, 1))
)
if locus == "AMELOGENIN":
plot.update_layout(
xaxis=dict(tickvals=np.arange(-1, 2, 1), tickmode="array", ticktext=["", "X", "Y", ""])
)
else:
plot.update_layout(
xaxis=dict(range=[min_x, max_x], tickmode="array", tickvals=np.arange(min_x, max_x, 1))
)
if all:
plot.update_layout(
yaxis=dict(range=[0, ymax], tickmode="array", tickvals=np.arange(0, ymax, increase))
@@ -307,11 +340,16 @@ def interactive_setup(df1, file):
)
interactive_plots_allmarkers(sample_df, flags)
else:
plot_df = sample_df
sample_df["CE_Allele"] = np.where(
sample_df["Locus"] == "AMELOGENIN",
np.where(sample_df["CE_Allele"] == "X", 0, 1),
sample_df["CE_Allele"],
)
plot_df["CE_Allele"] = pd.to_numeric(plot_df["CE_Allele"])
locus_key = f"{sample}_{locus}"
if locus_key not in st.session_state:
st.session_state[locus_key] = sample_df[sample_df["Locus"] == locus].reset_index(
drop=True
)
st.session_state[locus_key] = plot_df[plot_df["Locus"] == locus].reset_index(drop=True)
Type = [
"Deleted",
"Typed",
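As context for the gui.py changes above, the standalone sketch below shows how the amelogenin handling works: X and Y calls are recoded to the numeric positions 0 and 1 so they can share the numeric CE-allele axis, and the axis ticks are then relabeled back to "X" and "Y". The example DataFrame and the bar-chart framing are illustrative assumptions; only the recoding and tick-relabel pattern comes from the diff.

# Standalone sketch of the AMELOGENIN recoding and tick relabeling used in gui.py.
# The example DataFrame is made up for demonstration.
import numpy as np
import pandas as pd
import plotly.graph_objs as go

df = pd.DataFrame(
    {
        "Locus": ["AMELOGENIN", "AMELOGENIN", "CSF1PO"],
        "CE_Allele": ["X", "Y", "12"],
        "Reads": [5200, 4800, 3100],
    }
)
# Recode X -> 0 and Y -> 1 for amelogenin rows only, then cast the column to numeric
# so amelogenin can be plotted on the same numeric allele axis as the other loci.
df["CE_Allele"] = np.where(
    df["Locus"] == "AMELOGENIN",
    np.where(df["CE_Allele"] == "X", 0, 1),
    df["CE_Allele"],
)
df["CE_Allele"] = pd.to_numeric(df["CE_Allele"])

amel = df[df["Locus"] == "AMELOGENIN"]
fig = go.Figure(go.Bar(x=amel["CE_Allele"], y=amel["Reads"]))
# Relabel the numeric ticks back to X and Y, padding the ends with blanks.
fig.update_layout(
    title="AMELOGENIN",
    xaxis=dict(tickmode="array", tickvals=np.arange(-1, 3, 1), ticktext=["", "X", "Y", ""]),
)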
18 changes: 18 additions & 0 deletions lusSTR/data/filters.json
@@ -1,4 +1,22 @@
{
"AMELOGENIN": {
"MinimumNumberReadsForDynamicThresholds": 650,
"DetectionThresholdStaticCount": 10,
"DetectionThresholdDynamicPercent": 0,
"DetectionThresholdUse": "Static",
"AnalyticalThresholdStaticCount": 20,
"AnalyticalThresholdDynamicPercent": 0.017,
"AnalyticalThresholdUse": "Both",
"StochasticThresholdStaticCount": 20,
"StochasticThresholdDynamicPercent": 0.017,
"StochasticThresholdUse": "Both",
"MinimumHeterozygousBalanceThresholdDynamicPercent": 0.50,
"SameSizeThresholdDynamicPercent": 0,
"StutterThresholdDynamicPercent": 0,
"StutterForwardThresholdDynamicPercent": 0,
"Intercept": 0,
"Slope": 0
},
"CSF1PO": {
"MinimumNumberReadsForDynamicThresholds": 650,
"DetectionThresholdStaticCount": 10,
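For readers unfamiliar with the filter settings, the static/dynamic threshold fields in the new AMELOGENIN entry are resolved into a single read-count threshold by the filtering code (see the filter_amel addition in filter_settings.py further down). Below is a minimal sketch of that resolution logic using the values above; resolve_threshold is a hypothetical helper written for illustration, not a lusSTR function.

# Sketch: resolve one threshold (here "Analytical") from a filters.json entry.
# resolve_threshold is a hypothetical helper written for illustration only.
def resolve_threshold(metadata, which, locus_reads):
    use = metadata[f"{which}ThresholdUse"].lower()
    static_count = metadata[f"{which}ThresholdStaticCount"]
    dynamic = round(metadata[f"{which}ThresholdDynamicPercent"] * locus_reads, 1)
    # Dynamic thresholds only apply once the locus has enough total reads.
    if use == "dynamic" and locus_reads < metadata["MinimumNumberReadsForDynamicThresholds"]:
        use = "static"
    if use == "both":
        return max(dynamic, static_count)
    return static_count if use == "static" else dynamic

amel_settings = {
    "MinimumNumberReadsForDynamicThresholds": 650,
    "AnalyticalThresholdStaticCount": 20,
    "AnalyticalThresholdDynamicPercent": 0.017,
    "AnalyticalThresholdUse": "Both",
}
# With 10,000 total reads, "Both" takes the larger of 20 and 0.017 * 10,000 = 170 reads.
print(resolve_threshold(amel_settings, "Analytical", 10000))  # 170.0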
101 changes: 101 additions & 0 deletions lusSTR/data/str_lists.json
@@ -0,0 +1,101 @@
{

"powerseq_strs" : [
"AMELOGENIN",
"CSF1PO",
"D10S1248",
"D12S391",
"D13S317",
"D16S539",
"D18S51",
"D19S433",
"D1S1656",
"D21S11",
"D22S1045",
"D2S1338",
"D2S441",
"D3S1358",
"D5S818",
"D7S820",
"D8S1179",
"FGA",
"PENTA D",
"PENTA E",
"TH01",
"TPOX",
"VWA"
],
"forenseq_strs" : [
"AMELOGENIN",
"CSF1PO",
"D10S1248",
"D12S391",
"D13S317",
"D16S539",
"D17S1301",
"D18S51",
"D19S433",
"D1S1656",
"D20S482",
"D21S11",
"D22S1045",
"D2S1338",
"D2S441",
"D3S1358",
"D4S2408",
"D5S818",
"D6S1043",
"D7S820",
"D8S1179",
"D9S1122",
"FGA",
"PENTA D",
"PENTA E",
"TH01",
"TPOX",
"VWA"
],
"powerseq_ystrs" : [
"DYS19",
"DYS385A-B",
"DYS389II",
"DYS390",
"DYS391",
"DYS392",
"DYS393",
"DYS437",
"DYS438",
"DYS439",
"DYS448",
"DYS456",
"DYS458",
"DYS481",
"DYS533",
"DYS549",
"DYS570",
"DYS576",
"DYS635",
"DYS643",
"Y-GATA-H4"
],
"forenseq_ystrs" : [
"DYS19",
"DYS385A-B",
"DYS389II",
"DYS390",
"DYS391",
"DYS392",
"DYS437",
"DYS438",
"DYS439",
"DYS448",
"DYS481",
"DYS533",
"DYS549",
"DYS570",
"DYS576",
"DYS635",
"DYS643",
"Y-GATA-H4"
]
}
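The new str_lists.json file is read in gui.py through importlib.resources (see get_strlist_file() in the first diff), and the kit chosen in the GUI decides whether the ForenSeq or PowerSeq autosomal list drives the per-marker plots. Below is a small sketch of that lookup, assuming lusSTR is installed; the kit variable stands in for st.session_state.kit.

# Sketch: load the packaged STR lists and pick the autosomal list for a kit,
# mirroring how gui.py chooses between forenseq_strs and powerseq_strs.
import importlib.resources
import json

with open(importlib.resources.files("lusSTR") / "data/str_lists.json", "r") as fh:
    str_lists = json.load(fh)

kit = "forenseq"  # stand-in for st.session_state.kit
all_loci = str_lists["forenseq_strs"] if kit == "forenseq" else str_lists["powerseq_strs"]
print(len(all_loci))  # 28 ForenSeq autosomal loci, AMELOGENIN included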
19 changes: 19 additions & 0 deletions lusSTR/data/str_markers.json
@@ -1,4 +1,23 @@
{
"AMELOGENIN": {
"BasesToSubtract": 0,
"NumRepeats": 1,
"Repeats": [
"AAAGTG"
],
"NumBasesToSeparate": 0,
"ReverseCompNeeded": "No",
"LUS": "",
"Sec": "",
"Tert": "",
"Foren_5": 26,
"Foren_3": 37,
"Power_5": 10,
"Power_3": 37,
"Custom_5": 0,
"Custom_3": 0,
"Alleles": ["X", "Y"]
},
"CSF1PO": {
"BasesToSubtract": 0,
"NumRepeats": 1,
57 changes: 47 additions & 10 deletions lusSTR/scripts/filter_settings.py
@@ -28,20 +28,57 @@ def get_filter_metadata_file():

def filters(locus_allele_info, locus, locus_reads, datatype, brack_col):
metadata = filter_marker_data[locus]
if len(locus_allele_info) == 1:
locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info)
if locus == "AMELOGENIN":
locus_allele_info = filter_amel(metadata, locus_allele_info, locus_reads)
Comment on lines -31 to +32 (review comment from a repository member):
There's nothing necessarily wrong with how you've updated this code, but we could keep the nesting complexity to a minimum with something like this, right?

if locus == "AMELOGENIN":
   # ...
elif len(locus_allele_info) == 1:
   # ...
else:
   # ...

Reply from the PR author (contributor):
I was just trying to avoid having to repeat code (line 34 would have to be in both blocks).

else:
locus_allele_info, locus_reads = multiple_allele_thresholds(
metadata, locus_reads, locus_allele_info
)
locus_allele_info = ce_filtering(
locus_allele_info, locus_reads, metadata, datatype, brack_col
)
if datatype != "ce":
locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype)
locus_allele_info["CE_Allele"] = locus_allele_info["CE_Allele"].astype(float)
if len(locus_allele_info) == 1:
locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info)
else:
locus_allele_info, locus_reads = multiple_allele_thresholds(
metadata, locus_reads, locus_allele_info
)
locus_allele_info = ce_filtering(
locus_allele_info, locus_reads, metadata, datatype, brack_col
)
if datatype != "ce":
locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype)
return locus_allele_info


def filter_amel(metadata, amel_df, locus_reads):
for filter in ["Detection", "Analytical"]:
use = metadata[f"{filter}ThresholdUse"]
count = metadata[f"{filter}ThresholdStaticCount"]
perc = metadata[f"{filter}ThresholdDynamicPercent"]
thresh_perc = round(perc * locus_reads, 1)
if (
use.lower() == "dynamic"
and locus_reads < metadata["MinimumNumberReadsForDynamicThresholds"]
):
use = "static"
if use.lower() == "both":
thresh = thresh_perc if thresh_perc >= count else count
elif use.lower() == "static":
thresh = count
elif use.lower() == "dynamic":
thresh = thresh_perc
if filter == "Detection":
amel_dt = amel_df[amel_df["Reads"] >= thresh].reset_index(drop=True)
locus_reads = amel_df["Reads"].sum()
else:
for i in range(len(amel_dt)):
al_reads = amel_dt.loc[i, "Reads"]
if al_reads < thresh:
amel_dt.loc[i, ["allele_type", "perc_noise"]] = [
"BelowAT",
round(al_reads / locus_reads, 3),
]
else:
amel_dt.loc[i, "allele_type"] = "Typed"
return amel_dt


def single_allele_thresholds(metadata, locus_reads, single_all_df):
if thresholds("Detection", metadata, locus_reads, single_all_df["Reads"][0])[1] is False:
single_all_df = pd.DataFrame()
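Following up on the review thread about nesting in filters(): below is a hedged sketch of the flatter if/elif/else shape the reviewer suggested, arranged so the CE_Allele float cast is written only once (the concern raised in the author's reply). It reuses filter_marker_data and the helpers already defined in filter_settings.py and is illustrative only, not the code that was merged.

# Sketch of the flatter structure from the review thread; not the merged code.
# filter_marker_data and the helper functions are those defined in filter_settings.py.
def filters(locus_allele_info, locus, locus_reads, datatype, brack_col):
    metadata = filter_marker_data[locus]
    # AMELOGENIN alleles are "X"/"Y", so the float cast only applies to other loci;
    # guarding it here keeps it written once despite the flat if/elif/else below.
    if locus != "AMELOGENIN":
        locus_allele_info["CE_Allele"] = locus_allele_info["CE_Allele"].astype(float)
    if locus == "AMELOGENIN":
        locus_allele_info = filter_amel(metadata, locus_allele_info, locus_reads)
    elif len(locus_allele_info) == 1:
        locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info)
    else:
        locus_allele_info, locus_reads = multiple_allele_thresholds(
            metadata, locus_reads, locus_allele_info
        )
        locus_allele_info = ce_filtering(
            locus_allele_info, locus_reads, metadata, datatype, brack_col
        )
        if datatype != "ce":
            locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype)
    return locus_allele_info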