diff --git a/lusSTR/cli/gui.py b/lusSTR/cli/gui.py index 1971a6a9..29b95ffb 100644 --- a/lusSTR/cli/gui.py +++ b/lusSTR/cli/gui.py @@ -22,6 +22,7 @@ import pandas as pd from pathlib import Path import plotly.express as px +import plotly.graph_objs as go import streamlit as st from streamlit_option_menu import option_menu import yaml @@ -43,6 +44,14 @@ def get_filter_metadata_file(): filter_marker_data = json.load(fh) +def get_strlist_file(): + return importlib.resources.files("lusSTR") / "data/str_lists.json" + + +with open(get_strlist_file(), "r") as fh: + str_lists = json.load(fh) + + # ------------ Function to Generate config.yaml File ---------- # @@ -197,14 +206,33 @@ def interactive_plots_allmarkers(sample_df, flagged_df): max_yvalue = (int(math.ceil(max_reads / n)) * n) + n increase_value = int(math.ceil((max_yvalue / 5) / n)) * n n = 0 - for marker in sample_df["Locus"].unique(): + all_loci = ( + str_lists["forenseq_strs"] + if st.session_state.kit == "forenseq" + else str_lists["powerseq_strs"] + ) + missing_loci = [x for x in all_loci if x not in sample_df["Locus"].unique()] + for marker in all_loci: col = cols[n] container = col.container(border=True) sample_locus = sample_df["SampleID"].unique() + "_" + marker - marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") + sample_df = np.where( + sample_df["Locus"] == "AMELOGENIN", + np.where(sample_df["CE_Allele"] == "X", 0, 1), + sample_df["CE_Allele"], + ) + sample_df["CE_Allele"] = pd.to_numeric(sample_df["CE_Allele"]) + marker_df = sample_df[sample_df["Locus"] == marker].sort_values( + by=["CE_Allele", "allele_type"], ascending=[False, True] + ) if sample_locus in flagged_df["key"].values: marker = f"⚠️{marker}⚠️" - plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True) + if marker in missing_loci: + marker = f"⚠️{marker}⚠️" + plot = go.Figure() + plot.update_layout(title=marker) + else: + plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True) container.plotly_chart(plot, use_container_width=True) if n == 3: n = 0 @@ -240,9 +268,14 @@ def interactive_plots(df, locus, ymax, increase, all=False): ) plot.add_hline(y=at, line_width=3, line_dash="dot", line_color="gray") plot.add_annotation(text=f"AT", x=min_x + 0.1, y=at, showarrow=False, yshift=10) - plot.update_layout( - xaxis=dict(range=[min_x, max_x], tickmode="array", tickvals=np.arange(min_x, max_x, 1)) - ) + if locus == "AMELOGENIN": + plot.update_layout( + xaxis=dict(tickvals=np.arange(-1, 2, 1), tickmode="array", ticktext=["", "X", "Y", ""]) + ) + else: + plot.update_layout( + xaxis=dict(range=[min_x, max_x], tickmode="array", tickvals=np.arange(min_x, max_x, 1)) + ) if all: plot.update_layout( yaxis=dict(range=[0, ymax], tickmode="array", tickvals=np.arange(0, ymax, increase)) @@ -307,11 +340,16 @@ def interactive_setup(df1, file): ) interactive_plots_allmarkers(sample_df, flags) else: + plot_df = sample_df + sample_df = np.where( + sample_df["Locus"] == "AMELOGENIN", + np.where(sample_df["CE_Allele"] == "X", 0, 1), + sample_df["CE_Allele"], + ) + plot_df["CE_Allele"] = pd.to_numeric(plot_df["CE_Allele"]) locus_key = f"{sample}_{locus}" if locus_key not in st.session_state: - st.session_state[locus_key] = sample_df[sample_df["Locus"] == locus].reset_index( - drop=True - ) + st.session_state[locus_key] = plot_df[plot_df["Locus"] == locus].reset_index(drop=True) Type = [ "Deleted", "Typed", diff --git a/lusSTR/data/filters.json b/lusSTR/data/filters.json index 64f06b4b..d241505c 100644 --- a/lusSTR/data/filters.json +++ b/lusSTR/data/filters.json @@ -1,4 +1,22 @@ { + "AMELOGENIN": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0.50, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, "CSF1PO": { "MinimumNumberReadsForDynamicThresholds": 650, "DetectionThresholdStaticCount": 10, diff --git a/lusSTR/data/str_lists.json b/lusSTR/data/str_lists.json new file mode 100644 index 00000000..e1d54894 --- /dev/null +++ b/lusSTR/data/str_lists.json @@ -0,0 +1,101 @@ +{ + + "powerseq_strs" : [ + "AMELOGENIN", + "CSF1PO", + "D10S1248", + "D12S391", + "D13S317", + "D16S539", + "D18S51", + "D19S433", + "D1S1656", + "D21S11", + "D22S1045", + "D2S1338", + "D2S441", + "D3S1358", + "D5S818", + "D7S820", + "D8S1179", + "FGA", + "PENTA D", + "PENTA E", + "TH01", + "TPOX", + "VWA" + ], + "forenseq_strs" : [ + "AMELOGENIN", + "CSF1PO", + "D10S1248", + "D12S391", + "D13S317", + "D16S539", + "D17S1301", + "D18S51", + "D19S433", + "D1S1656", + "D20S482", + "D21S11", + "D22S1045", + "D2S1338", + "D2S441", + "D3S1358", + "D4S2408", + "D5S818", + "D6S1043", + "D7S820", + "D8S1179", + "D9S1122", + "FGA", + "PENTA D", + "PENTA E", + "TH01", + "TPOX", + "VWA" + ], + "powerseq_ystrs" : [ + "DYS19", + "DYS385A-B", + "DYS389II", + "DYS390", + "DYS391", + "DYS392", + "DYS393", + "DYS437", + "DYS438", + "DYS439", + "DYS448", + "DYS456", + "DYS458", + "DYS481", + "DYS533", + "DYS549", + "DYS570", + "DYS576", + "DYS635", + "DYS643", + "Y-GATA-H4" + ], + "forenseq_ystrs" : [ + "DYS19", + "DYS385A-B", + "DYS389II", + "DYS390", + "DYS391", + "DYS392", + "DYS437", + "DYS438", + "DYS439", + "DYS448", + "DYS481", + "DYS533", + "DYS549", + "DYS570", + "DYS576", + "DYS635", + "DYS643", + "Y-GATA-H4" + ] +} \ No newline at end of file diff --git a/lusSTR/data/str_markers.json b/lusSTR/data/str_markers.json index 38c613d5..cf030cb4 100644 --- a/lusSTR/data/str_markers.json +++ b/lusSTR/data/str_markers.json @@ -1,4 +1,23 @@ { + "AMELOGENIN": { + "BasesToSubtract": 0, + "NumRepeats": 1, + "Repeats": [ + "AAAGTG" + ], + "NumBasesToSeparate": 0, + "ReverseCompNeeded": "No", + "LUS": "", + "Sec": "", + "Tert": "", + "Foren_5": 26, + "Foren_3": 37, + "Power_5": 10, + "Power_3": 37, + "Custom_5": 0, + "Custom_3": 0, + "Alleles": ["X", "Y"] + }, "CSF1PO": { "BasesToSubtract": 0, "NumRepeats": 1, diff --git a/lusSTR/scripts/filter_settings.py b/lusSTR/scripts/filter_settings.py index e639a33b..88430dc6 100644 --- a/lusSTR/scripts/filter_settings.py +++ b/lusSTR/scripts/filter_settings.py @@ -28,20 +28,57 @@ def get_filter_metadata_file(): def filters(locus_allele_info, locus, locus_reads, datatype, brack_col): metadata = filter_marker_data[locus] - if len(locus_allele_info) == 1: - locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info) + if locus == "AMELOGENIN": + locus_allele_info = filter_amel(metadata, locus_allele_info, locus_reads) else: - locus_allele_info, locus_reads = multiple_allele_thresholds( - metadata, locus_reads, locus_allele_info - ) - locus_allele_info = ce_filtering( - locus_allele_info, locus_reads, metadata, datatype, brack_col - ) - if datatype != "ce": - locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype) + locus_allele_info["CE_Allele"] = locus_allele_info["CE_Allele"].astype(float) + if len(locus_allele_info) == 1: + locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info) + else: + locus_allele_info, locus_reads = multiple_allele_thresholds( + metadata, locus_reads, locus_allele_info + ) + locus_allele_info = ce_filtering( + locus_allele_info, locus_reads, metadata, datatype, brack_col + ) + if datatype != "ce": + locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype) return locus_allele_info +def filter_amel(metadata, amel_df, locus_reads): + for filter in ["Detection", "Analytical"]: + use = metadata[f"{filter}ThresholdUse"] + count = metadata[f"{filter}ThresholdStaticCount"] + perc = metadata[f"{filter}ThresholdDynamicPercent"] + thresh_perc = round(perc * locus_reads, 1) + if ( + use.lower() == "dynamic" + and locus_reads < metadata["MinimumNumberReadsForDynamicThresholds"] + ): + use = "static" + if use.lower() == "both": + thresh = thresh_perc if thresh_perc >= count else count + elif use.lower() == "static": + thresh = count + elif use.lower() == "dynamic": + thresh = thresh_perc + if filter == "Detection": + amel_dt = amel_df[amel_df["Reads"] >= thresh].reset_index(drop=True) + locus_reads = amel_df["Reads"].sum() + else: + for i in range(len(amel_dt)): + al_reads = amel_dt.loc[i, "Reads"] + if al_reads < thresh: + amel_dt.loc[i, ["allele_type", "perc_noise"]] = [ + "BelowAT", + round(al_reads / locus_reads, 3), + ] + else: + amel_dt.loc[i, "allele_type"] = "Typed" + return amel_dt + + def single_allele_thresholds(metadata, locus_reads, single_all_df): if thresholds("Detection", metadata, locus_reads, single_all_df["Reads"][0])[1] is False: single_all_df = pd.DataFrame() diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index ab91ae6f..95aa2baa 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -63,7 +63,10 @@ def __init__(self, locus, sequence, software, custom=False, kit="forenseq"): @property def repeat_size(self): - return len(self.data["LUS"]) + if self.data["LUS"] != "": + return len(self.data["LUS"]) + else: + return 1 @property def repeats(self): @@ -355,6 +358,87 @@ def summary(self): ] +class STRMarker_Amelogenin(STRMarker): + @property + def forward_sequence(self): + if self.software == "uas": + return self.sequence + front, back = self._uas_bases_to_trim() + if len(self.sequence) == 0: + back = None + else: + back *= -1 + if self.sequence[front:back] == "": + return "" + else: + return self.sequence[front:back] + + @property + def custom_sequence(self): + if self.custom: + custom_front, custom_back = self._uas_bases_to_trim() + if custom_back == 0: + custom_back = None + else: + custom_back *= -1 + if self.sequence[custom_front:custom_back] == "": + return "" + else: + return self.sequence[custom_front:custom_back] + else: + return None + + @property + def canonical(self): + if self.uas_sequence == "AAAGTG": + return "Y" + elif self.uas_sequence == "": + return "X" + else: + return self.uas_sequence + + @property + def convert(self): + if self.forward_sequence == "": + return "" + else: + return self.forward_sequence + + @property + def custom_brack(self): + if self.forward_sequence == "": + return "" + else: + return "NA" + + @property + def summary(self): + if self.uas_sequence == "": + return [ + "", + "", + "", + "", + "NA", + "NA", + "X", + "NA", + "NA", + ] + else: + return [ + self.uas_sequence, + self.forward_sequence, + self.custom_sequence, + self.convert, + self.convert, + self.custom_brack, + self.canonical, + "NA", + "NA", + ] + + class STRMarker_D8S1179(STRMarker): @property def flank_5p(self): @@ -1742,6 +1826,7 @@ def flank_5p(self): def STRMarkerObject(locus, sequence, software, custom=False, kit="forenseq"): constructors = { + "AMELOGENIN": STRMarker_Amelogenin, "D8S1179": STRMarker_D8S1179, "D13S317": STRMarker_D13S317, "D20S482": STRMarker_D20S482, diff --git a/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv b/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv index 5a2541dd..84c079d7 100644 --- a/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv +++ b/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv @@ -1,24 +1,24 @@ SampleID,Locus,CE_Allele,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter -Sample1,D4S2408,10.0,10_10_0,1022,Typed,,,,,, -Sample1,D4S2408,9.0,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,, -Sample1,D4S2408,8.0,8_8_0,1050,Typed,,,,,, -Sample1,D8S1179,14.0,14_12_1_0,869,Typed,,,,,, -Sample1,D8S1179,13.0,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212 -Sample1,D8S1179,12.0,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201 -Sample1,D9S1122,13.0,13_11,948,Typed,,,,,, -Sample1,D9S1122,12.0,12_10,108,-1_stutter,13_11,,948.0,,,0.114 -Sample1,D9S1122,11.0,11_11,991,Typed,,,,,, -Sample1,D9S1122,10.0,10_10,87,-1_stutter,11_11,,991.0,,,0.088 -Sample1,FGA,23.0,23_15_3_0,1436,Typed,,,,,, -Sample1,FGA,22.0,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182 -Sample1,FGA,21.0,21_13_3_0,48,BelowAT,,,,,0.013, -Sample1,FGA,20.0,20_12_3_0,1750,Typed,,,,,, -Sample1,FGA,18.0,18_10_3_0,181,Typed,,,,,, -Sample1,FGA,17.0,17_9_3_0,15,BelowAT,,,,,0.004, -Sample1,PENTA D,15.0,15_15,50,Typed,,,,,, -Sample1,PENTA D,13.0,13_13,1000,Typed,,,,,, +Sample1,D4S2408,10,10_10_0,1022,Typed,,,,,, +Sample1,D4S2408,9,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,, +Sample1,D4S2408,8,8_8_0,1050,Typed,,,,,, +Sample1,D8S1179,14,14_12_1_0,869,Typed,,,,,, +Sample1,D8S1179,13,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212 +Sample1,D8S1179,12,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201 +Sample1,D9S1122,13,13_11,948,Typed,,,,,, +Sample1,D9S1122,12,12_10,108,-1_stutter,13_11,,948.0,,,0.114 +Sample1,D9S1122,11,11_11,991,Typed,,,,,, +Sample1,D9S1122,10,10_10,87,-1_stutter,11_11,,991.0,,,0.088 +Sample1,FGA,23,23_15_3_0,1436,Typed,,,,,, +Sample1,FGA,22,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182 +Sample1,FGA,21,21_13_3_0,48,BelowAT,,,,,0.013, +Sample1,FGA,20,20_12_3_0,1750,Typed,,,,,, +Sample1,FGA,18,18_10_3_0,181,Typed,,,,,, +Sample1,FGA,17,17_9_3_0,15,BelowAT,,,,,0.004, +Sample1,PENTA D,15,15_15,50,Typed,,,,,, +Sample1,PENTA D,13,13_13,1000,Typed,,,,,, Sample1,PENTA E,7.0,7_7,505,Typed,,,,,, -Sample1,TH01,7.0,7_7,2197,Typed,,,,,, -Sample1,TH01,6.0,6_6,1632,Typed,,,,,, -Sample1,TH01,5.0,5_5,66,BelowAT,,,,,0.017, +Sample1,TH01,7,7_7,2197,Typed,,,,,, +Sample1,TH01,6,6_6,1632,Typed,,,,,, +Sample1,TH01,5,5_5,66,BelowAT,,,,,0.017, Sample1,TPOX,11.0,11_11,15,BelowAT,,,,,1.0, diff --git a/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv b/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv index 7531c6f3..b83fce36 100644 --- a/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv +++ b/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv @@ -1,28 +1,28 @@ Locus,CE Allele,Allele Seq,Reads -D4S2408,8.0,ATCTATCTATCTATCTATCTATCTATCTATCT,1000 -D4S2408,9.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357 -D4S2408,10.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,900 -D8S1179,12.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26 -D8S1179,12.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,11 -D8S1179,13.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95 -D8S1179,13.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89 -D8S1179,14.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739 -D8S1179,14.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,130 -D9S1122,10.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,87 -D9S1122,11.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,991 -D9S1122,12.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,108 -D9S1122,13.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,948 -FGA,17.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,15 -FGA,18.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,181 -FGA,20.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1750 -FGA,21.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,48 -FGA,22.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,262 -FGA,23.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1436 -PentaD,13.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000 -PentaD,15.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,50 -PentaE,7.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505 -TH01,5.0,AATGAATGAATGAATGAATG,66 -TH01,6.0,AATGAATGAATGAATGAATGAATG,1632 -TH01,7.0,AATGAATGAATGAATGAATGAATGAATG,2197 -TPOX,11.0,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,15 -vWA,16.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,6 +D4S2408,8,ATCTATCTATCTATCTATCTATCTATCTATCT,1000 +D4S2408,9,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357 +D4S2408,10,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,900 +D8S1179,12,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26 +D8S1179,12,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,11 +D8S1179,13,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95 +D8S1179,13,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89 +D8S1179,14,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739 +D8S1179,14,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,130 +D9S1122,10,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,87 +D9S1122,11,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,991 +D9S1122,12,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,108 +D9S1122,13,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,948 +FGA,17,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,15 +FGA,18,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,181 +FGA,20,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1750 +FGA,21,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,48 +FGA,22,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,262 +FGA,23,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1436 +PentaD,13,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000 +PentaD,15,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,50 +PentaE,7,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505 +TH01,5,AATGAATGAATGAATGAATG,66 +TH01,6,AATGAATGAATGAATGAATGAATG,1632 +TH01,7,AATGAATGAATGAATGAATGAATGAATG,2197 +TPOX,11,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,15 +vWA,16,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,6 diff --git a/lusSTR/tests/data/STRaitRazor_output_test_A001.csv b/lusSTR/tests/data/STRaitRazor_output_test_A001.csv index 219ee03f..c4a2a09a 100644 --- a/lusSTR/tests/data/STRaitRazor_output_test_A001.csv +++ b/lusSTR/tests/data/STRaitRazor_output_test_A001.csv @@ -1,4 +1,6 @@ Locus,Total_Reads,Sequence,SampleID,Project,Analysis +Amelogenin,226,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A001,NA,NA +Amelogenin,162,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A001,NA,NA CSF1PO,547,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,NA,NA CSF1PO,25,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,NA,NA CSF1PO,7,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,NA,NA diff --git a/lusSTR/tests/data/STRait_Razor_test_output.csv b/lusSTR/tests/data/STRait_Razor_test_output.csv index 57449926..959433cf 100644 --- a/lusSTR/tests/data/STRait_Razor_test_output.csv +++ b/lusSTR/tests/data/STRait_Razor_test_output.csv @@ -1,4 +1,6 @@ Locus,Total_Reads,Sequence,SampleID,Project,Analysis +Amelogenin,226,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A001,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,162,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A001,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,547,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,25,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,7,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,STRait_Razor_test_output,STRait_Razor_test_output @@ -610,6 +612,8 @@ vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A001,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A001,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A001,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,249,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A002,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,171,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A002,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,498,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A002,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,402,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A002,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,41,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A002,STRait_Razor_test_output,STRait_Razor_test_output @@ -1450,6 +1454,8 @@ vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATTGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGGCAGACAGATAGATCAAT,A002,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGGTGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A002,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATAAAT,A002,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,313,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A003,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,167,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A003,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,696,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A003,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,35,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A003,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,10,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A003,STRait_Razor_test_output,STRait_Razor_test_output @@ -2313,6 +2319,8 @@ vWA,3,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAG vWA,3,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A003,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A003,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A003,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,178,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A004,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,135,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A004,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,469,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A004,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,381,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A004,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,22,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A004,STRait_Razor_test_output,STRait_Razor_test_output @@ -3173,6 +3181,11 @@ vWA,2,AATACATAGGATGGATGGATAGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A004,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGATAGATCAAT,A004,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A004,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,322,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A005,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,299,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A005,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,3,TAGTGGGTGGATTCATCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A005,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTTCAGTTCCTACCAC,A005,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGGATGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A005,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,817,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A005,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,28,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A005,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,13,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A005,STRait_Razor_test_output,STRait_Razor_test_output @@ -4101,6 +4114,9 @@ vWA,2,AATACATAGGATGGATAGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A005,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATGGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A005,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGTTCAAT,A005,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,255,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A006,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,196,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A006,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGCGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A006,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,429,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A006,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,390,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A006,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,59,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A006,STRait_Razor_test_output,STRait_Razor_test_output @@ -4892,6 +4908,8 @@ vWA,2,AATACATAGGATGGATGGATAGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A006,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAAACAGACAGATAGATCAAT,A006,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGATAGATCAAT,A006,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,230,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A007,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,212,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A007,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,864,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A007,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,57,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A007,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,8,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A007,STRait_Razor_test_output,STRait_Razor_test_output @@ -5885,6 +5903,10 @@ vWA,3,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A007,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A007,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A007,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,385,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A008,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,259,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A008,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,3,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACCGTTCCTACCAC,A008,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGGGTGGATACTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A008,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,523,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A008,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,483,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A008,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,30,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A008,STRait_Razor_test_output,STRait_Razor_test_output @@ -6808,6 +6830,8 @@ vWA,2,AATACATAGAATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A008,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A008,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,GATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A008,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,317,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A009,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,194,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A009,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,407,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A009,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,338,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A009,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,16,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A009,STRait_Razor_test_output,STRait_Razor_test_output @@ -7722,6 +7746,9 @@ vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGGCAGATAGATCAAT,A009,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A009,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A009,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,227,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A010,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,143,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A010,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTATAGTTCCTACCAT,A010,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,664,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A010,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,39,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A010,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,2,CGTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A010,STRait_Razor_test_output,STRait_Razor_test_output @@ -8718,6 +8745,8 @@ vWA,3,GATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAG vWA,3,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A010,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGATAGACAGACAGATAGATCAAT,A010,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGATAGATCAAT,A010,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,276,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A011,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,228,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A011,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,449,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A011,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,272,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A011,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,28,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A011,STRait_Razor_test_output,STRait_Razor_test_output @@ -9677,6 +9706,11 @@ vWA,3,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATAGATCAAT,A011,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACGGACAGACAGATAGATCAAT,A011,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A011,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,418,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A012,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,339,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A012,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGTGTTGATTCTCTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A012,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGGGTGGATTCTTCGTCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A012,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTATCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A012,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,1131,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A012,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,43,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A012,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,13,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A012,STRait_Razor_test_output,STRait_Razor_test_output diff --git a/lusSTR/tests/data/UAS_bulk_test.csv b/lusSTR/tests/data/UAS_bulk_test.csv index 88663214..064b8952 100644 --- a/lusSTR/tests/data/UAS_bulk_test.csv +++ b/lusSTR/tests/data/UAS_bulk_test.csv @@ -1,4 +1,6 @@ Locus,Reads,Repeat Sequence,SampleID,Project,Analysis +Amelogenin,143,,Positive Control,Project1,Analysis1 +Amelogenin,283,AAAGTG,Positive Control,Project1,Analysis1 D1S1656,33,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,13,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,231,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 @@ -127,6 +129,8 @@ D22S1045,13,ATTATTATTATTATTATTATTATTATTATTATTACTATTATT,Positive Control,Project1 D22S1045,146,ATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,Positive Control,Project1,Analysis1 D22S1045,1746,ATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,Positive Control,Project1,Analysis1 D22S1045,27,ATTATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,Positive Control,Project1,Analysis1 +Amelogenin,143,,Positive Control2,Project1,Analysis1 +Amelogenin,283,AAAGTG,Positive Control2,Project1,Analysis1 D1S1656,33,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control2,Project1,Analysis1 D1S1656,13,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,Positive Control2,Project1,Analysis1 D1S1656,231,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control2,Project1,Analysis1 diff --git a/lusSTR/tests/data/genemarker/genemarker_test.csv b/lusSTR/tests/data/genemarker/genemarker_test.csv index 142ac4b4..944ea126 100644 --- a/lusSTR/tests/data/genemarker/genemarker_test.csv +++ b/lusSTR/tests/data/genemarker/genemarker_test.csv @@ -1,4 +1,6 @@ Locus,Total_Reads,Sequence,SampleID,Project,Analysis +Amelogenin,14189,TCAGCTATGAGGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCACCAGCTTCCCA,2800M_strresults_filtered,NA,NA +Amelogenin,11986,TCAGCTATGAGGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCATCAGCTTCCCA,2800M_strresults_filtered,NA,NA PentaE,6733,TAATGATTACATAACATACATGTGTGTAAAGTGCTTAGTATCATGATTGATACATGGAAAGAATTCTCTTATTTGGGTTATTAATTGAGAAAACTCCTTACAATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTGAGAC,2800M_strresults_filtered,NA,NA PentaE,4746,TAATGATTACATAACATACATGTGTGTAAAGTGCTTAGTATCATGATTGATACATGGAAAGAATTCTCTTATTTGGGTTATTAATTGAGAAAACTCCTTACAATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTGAGAC,2800M_strresults_filtered,NA,NA D18S51,602,AGGCTGCAGTGAGCCATGTTCATGCCACTGCACTTCACTCTGAGTGACAAATTGAGACCTTGTCTCAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAGGAAAGAAAGAGAAAAAGAAAAGAAATAGTAGCAACTGTTATTGTA,2800M_strresults_filtered,NA,NA diff --git a/lusSTR/tests/data/genemarker/genemarker_test.txt b/lusSTR/tests/data/genemarker/genemarker_test.txt index 3b3fc202..326eb682 100644 --- a/lusSTR/tests/data/genemarker/genemarker_test.txt +++ b/lusSTR/tests/data/genemarker/genemarker_test.txt @@ -69,3 +69,5 @@ SampleID Project Analysis Locus UAS_Output_Sequence Forward_Strand_Sequence UAS_ 2800M_strresults_filtered NA NA D10S1248 GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA [GGAA]12 [GGAA]12 12 12_12 12_12 741 2800M_strresults_filtered NA NA CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [AGAT]12 [ATCT]12 12 12_12 12_12_0 14044 2800M_strresults_filtered NA NA CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [AGAT]11 [ATCT]11 11 11_11 11_11_0 1047 +2800M_strresults_filtered NA NA AMELOGENIN GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG NA NA 14189 +2800M_strresults_filtered NA NA AMELOGENIN GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG NA NA 11986 diff --git a/lusSTR/tests/data/genemarker/genemarker_test_flanks.txt b/lusSTR/tests/data/genemarker/genemarker_test_flanks.txt index 9740c817..a445bbc4 100644 --- a/lusSTR/tests/data/genemarker/genemarker_test_flanks.txt +++ b/lusSTR/tests/data/genemarker/genemarker_test_flanks.txt @@ -71,3 +71,5 @@ SampleID Project Analysis Locus Reads CE_Allele Full_Sequence 5_Flank_Bracketed_ 2800M_strresults_filtered NA NA D10S1248 741 12 CCCCAGGACCAATCTGGTCACAAACATATTAATGAATTGAACAAATGAGTGAGTGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA CCCC AGGA CCAA TCTG GTCA CAAA CATA TTAA TGAA TT GAAC AAAT [GAGT]2 [GGAA]12 2800M_strresults_filtered NA NA CSF1PO 14044 12 CTAAGTACTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTTCTATCTATGAAGGCAGTTACTGTTAATATCTTCATTTTACAGGTAGGAAAACTGAGACACAGGGTGGTTAGCAACCTGCTAGTCCTTGGCAGACTCAG CTA AGTA CT TCCT [ATCT]12 A [ATCT]3 T [CTAT]2 GAAG GCAG TTAC TGTT AATA TCTT CATT TTAC AGGT AGGA AAAC TGAG ACAC AGGG TGGT TAG CA ACCT GCTA GTCC TTGG CAGA CTCA G 2800M_strresults_filtered NA NA CSF1PO 1047 11 CTAAGTACTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTTCTATCTATGAAGGCAGTTACTGTTAATATCTTCATTTTACAGGTAGGAAAACTGAGACACAGGGTGGTTAGCAACCTGCTAGTCCTTGGCAGACTCAG CTA AGTA CT TCCT [ATCT]11 A [ATCT]3 T [CTAT]2 GAAG GCAG TTAC TGTT AATA TCTT CATT TTAC AGGT AGGA AAAC TGAG ACAC AGGG TGGT TAG CA ACCT GCTA GTCC TTGG CAGA CTCA G +2800M_strresults_filtered NA NA AMELOGENIN 14189 GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG TCAGCTATGAGGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCACCAGCTTCCCA T C A G C T A T G A GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG T [G]2 T [C]2 T G A [T]4 A C A G [T]2 [C]2 T A [C]2 A [C]2 A G C [T]2 [C]3 A Possible indel or partial sequence +2800M_strresults_filtered NA NA AMELOGENIN 11986 GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG TCAGCTATGAGGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCATCAGCTTCCCA T C A G C T A T G A GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG T [G]2 T [C]3 [A]2 [T]4 A C A G [T]2 [C]2 T A [C]2 A T C A G C [T]2 [C]3 A Possible indel or partial sequence diff --git a/lusSTR/tests/data/lusstr_output.csv b/lusSTR/tests/data/lusstr_output.csv index a53ea100..cce2ce0c 100644 --- a/lusSTR/tests/data/lusstr_output.csv +++ b/lusSTR/tests/data/lusstr_output.csv @@ -1,4 +1,6 @@ Locus,Reads,Repeat Sequence,SampleID,Project,Analysis +Amelogenin,143,,Positive Control,Project1,Analysis1 +Amelogenin,283,AAAGTG,Positive Control,Project1,Analysis1 D1S1656,33,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,13,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,231,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 diff --git a/lusSTR/tests/data/lusstr_output.txt b/lusSTR/tests/data/lusstr_output.txt index d86c37dd..852445e8 100644 --- a/lusSTR/tests/data/lusstr_output.txt +++ b/lusSTR/tests/data/lusstr_output.txt @@ -127,3 +127,5 @@ Positive_Control Project1 Analysis1 D10S1248 GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGA Positive_Control Project1 Analysis1 CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [AGAT]12 [ATCT]12 12 12_12 12_12_0 702 Positive_Control Project1 Analysis1 CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [AGAT]11 [ATCT]11 11 11_11 11_11_0 29 Positive_Control Project1 Analysis1 CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [AGAT]13 [ATCT]13 13 13_13 13_13_0 11 +Positive_Control Project1 Analysis1 AMELOGENIN AAAGTG AAAGTG AAAGTG AAAGTG Y NA NA 283 +Positive_Control Project1 Analysis1 AMELOGENIN NA X NA NA 143 diff --git a/lusSTR/tests/data/testformat.csv b/lusSTR/tests/data/testformat.csv index a53ea100..cce2ce0c 100644 --- a/lusSTR/tests/data/testformat.csv +++ b/lusSTR/tests/data/testformat.csv @@ -1,4 +1,6 @@ Locus,Reads,Repeat Sequence,SampleID,Project,Analysis +Amelogenin,143,,Positive Control,Project1,Analysis1 +Amelogenin,283,AAAGTG,Positive Control,Project1,Analysis1 D1S1656,33,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,13,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,231,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 diff --git a/lusSTR/workflows/strs.smk b/lusSTR/workflows/strs.smk index 3e2ad4e7..fb77fa4d 100644 --- a/lusSTR/workflows/strs.smk +++ b/lusSTR/workflows/strs.smk @@ -150,7 +150,8 @@ rule filter: filters=config["nofilters"], strand=config["strand"], custom=config["custom_ranges"], - sex=config["sex"] + sex=config["sex"], + kit=config["kit"] script: lusSTR.wrapper("filter") diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index cc79317b..49a7062d 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -58,8 +58,6 @@ def format_table(input, software, kit="forenseq", custom=False): locus = "PENTA E" if locus == "DYS385A/B" or locus == "DYS385": locus = "DYS385A-B" - if locus == "AMELOGENIN": - continue metadata = str_marker_data[locus] if kit == "forenseq": remove_5p = metadata["Foren_5"] @@ -67,7 +65,15 @@ def format_table(input, software, kit="forenseq", custom=False): else: remove_5p = metadata["Power_5"] remove_3p = metadata["Power_3"] - if len(sequence) <= (remove_5p + remove_3p) and software != "uas": + if custom: + if metadata["Custom_5"] < 0: + remove_5p = remove_5p - metadata["Custom_5"] + if metadata["Custom_3"] < 0: + remove_3p = remove_3p - metadata["Custom_3"] + locus_min_length = remove_5p + remove_3p + len(metadata["LUS"]) + if locus == "AMELOGENIN": + locus_min_length -= 1 + if software != "uas" and len(sequence) < locus_min_length: flank_summary = [ sampleid, project, @@ -218,8 +224,35 @@ def check_vwa(marker, sequence, software, custom): return new_marker -def combine_reads(table, columns): - comb_table = table.groupby(columns[:-1], as_index=False)["Reads"].sum() +def combine_reads(table, columns, custom=False): + if custom: + comb_table = ( + table.groupby( + [ + "SampleID", + "Project", + "Analysis", + "Locus", + "Custom_Range_Sequence", + "Custom_Bracketed_Notation", + "CE_Allele", + ] + ) + .agg( + { + "UAS_Output_Sequence": lambda x: ", ".join(x), + "Forward_Strand_Sequence": lambda x: ", ".join(x), + "UAS_Output_Bracketed_Notation": lambda x: ", ".join(x), + "Forward_Strand_Bracketed_Notation": lambda x: ", ".join(x), + "LUS": lambda x: ", ".join(x), + "LUS_Plus": lambda x: ", ".join(x), + "Reads": "sum", + } + ) + .reset_index() + ) + else: + comb_table = table.groupby(columns[:-1], as_index=False)["Reads"].sum() sorted = sort_table(comb_table) return sorted @@ -237,7 +270,7 @@ def remove_columns(column_list, remove_list): return column_list -def create_custom_outputtable(columns, table): +def create_custom_outputtable(columns, table, custom): remove_list = [ "UAS_Output_Sequence", "Forward_Strand_Sequence", @@ -273,7 +306,9 @@ def main(input, out, kit, software, sex, nocombine, custom): sex_final_table = combine_reads(sex_final_table, sex_columns) sex_final_table.to_csv(f"{full_table_name}_sexloci.txt", sep="\t", index=False) if custom: - sex_table_custom = create_custom_outputtable(sex_columns, sex_final_table) + sex_table_custom = create_custom_outputtable( + sex_columns, sex_final_table, custom=True + ) sex_table_custom.to_csv(f"{output_name}_sexloci.txt", index=False, sep="\t") else: sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) @@ -287,7 +322,9 @@ def main(input, out, kit, software, sex, nocombine, custom): autosomal_final_table = combine_reads(autosomal_final_table, columns) autosomal_final_table.to_csv(f"{full_table_name}.txt", sep="\t", index=False) if custom: - custom_table_comb = create_custom_outputtable(columns, autosomal_final_table) + custom_table_comb = create_custom_outputtable( + columns, autosomal_final_table, custom=True + ) custom_table_comb.to_csv(out, sep="\t", index=False) else: autosomal_final_table.to_csv(out, sep="\t", index=False) diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 782b6be2..812713e2 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -27,61 +27,6 @@ import sys -strs = [ - "CSF1PO", - "D10S1248", - "D12S391", - "D13S317", - "D16S539", - "D17S1301", - "D18S51", - "D19S433", - "D1S1656", - "D20S482", - "D21S11", - "D22S1045", - "D2S1338", - "D2S441", - "D3S1358", - "D4S2408", - "D5S818", - "D6S1043", - "D7S820", - "D8S1179", - "D9S1122", - "FGA", - "PENTA D", - "PENTA E", - "TH01", - "TPOX", - "VWA", -] - -ystrs = [ - "DYS19", - "DYS385A-B", - "DYS389II", - "DYS390", - "DYS391", - "DYS392", - "DYS393", - "DYS437", - "DYS438", - "DYS439", - "DYS448", - "DYS456", - "DYS458", - "DYS481", - "DYS533", - "DYS549", - "DYS570", - "DYS576", - "DYS635", - "DYS643", - "Y-GATA-H4", -] - - def get_filter_metadata_file(): return importlib.resources.files("lusSTR") / "data/filters.json" @@ -90,9 +35,19 @@ def get_filter_metadata_file(): filter_marker_data = json.load(fh) -def process_strs(dict_loc, datatype, seq_col, brack_col): +def get_strlist_file(): + return importlib.resources.files("lusSTR") / "data/str_lists.json" + + +with open(get_strlist_file(), "r") as fh: + str_lists = json.load(fh) + + +def process_strs(dict_loc, datatype, seq_col, brack_col, kit): final_df = pd.DataFrame() flags_df = pd.DataFrame() + strs = str_lists["powerseq_strs"] if kit == "powerseq" else str_lists["forenseq_strs"] + ystrs = str_lists["powerseq_ystrs"] if kit == "powerseq" else str_lists["forenseq_ystrs"] for key, value in dict_loc.items(): data = dict_loc[key].reset_index(drop=True) if datatype == "ce": @@ -146,27 +101,22 @@ def process_strs(dict_loc, datatype, seq_col, brack_col): filtered_df = filtered_df.replace({"nan": None}) final_df = pd.concat([final_df, filtered_df]) flags_df = pd.concat([flags_df, flags(filtered_df, datatype)]) - if datatype == "ce" or datatype == "ngs": - try: - final_df = final_df.astype({"CE_Allele": "float64", "Reads": "int"}) - except KeyError: - final_df = None return final_df, flags_df -def EFM_output(profile, outfile, profile_type, data_type, col, sex, separate=False): +def EFM_output(profile, outfile, profile_type, data_type, col, sex, kit, separate=False): if profile_type == "reference": profile = profile.query("allele_type == 'Typed'") else: profile = profile.query("allele_type != ['BelowAT', 'Deleted']") - efm_profile = populate_efm_profile(profile, data_type, col, sex) + efm_profile = populate_efm_profile(profile, data_type, col, sex, kit) if separate: write_sample_specific_efm_profiles(efm_profile, profile_type, data_type, outfile) else: write_aggregate_efm_profile(efm_profile, profile_type, data_type, outfile) -def populate_efm_profile(profile, data_type, colname, sex): +def populate_efm_profile(profile, data_type, colname, sex, kit): if data_type == "ce": prof_col = "CE_Allele" elif data_type == "lusplus": @@ -189,6 +139,8 @@ def populate_efm_profile(profile, data_type, colname, sex): allele_heights[row.SampleID][row.Locus][row.Allele] = int(row.Reads) max_num_alleles = determine_max_num_alleles(allele_heights) reformatted_profile = list() + strs = str_lists["powerseq_strs"] if kit == "powerseq" else str_lists["forenseq_strs"] + ystrs = str_lists["powerseq_ystrs"] if kit == "powerseq" else str_lists["forenseq_ystrs"] for sampleid, loci in allele_heights.items(): for locusid, alleles in loci.items(): allele_list, height_list = list(), list() @@ -213,7 +165,8 @@ def populate_efm_profile(profile, data_type, colname, sex): for col in height_columns: efm_profile[col] = efm_profile[col].astype("Int64") efm_profile = efm_profile.sort_values(by=["SampleName", "Marker"]) - return efm_profile + efm_profile_noamel = efm_profile[efm_profile["Marker"] != "AMELOGENIN"] + return efm_profile_noamel def write_sample_specific_efm_profiles(efm_profile, profile_type, data_type, outdir): @@ -262,6 +215,7 @@ def determine_max_num_alleles(allele_heights): def STRmix_output(profile, outdir, profile_type, data_type, seq_col): + profile = profile[profile["Locus"] != "AMELOGENIN"] Path(outdir).mkdir(parents=True, exist_ok=True) if profile_type == "reference": filtered_df = profile.query("allele_type == 'Typed'") @@ -359,21 +313,24 @@ def format_ref_table(new_rows, sample_data, datatype): return sort_df -def marker_plots(df, output_name, sex, wd="."): +def marker_plots(df, output_name, kit, wd="."): Path(f"{wd}/MarkerPlots").mkdir(parents=True, exist_ok=True) - df["CE_Allele"] = df["CE_Allele"].astype(float) filt_df = df[df["allele_type"] == "Typed"] for sample_id in df["SampleID"].unique(): - with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: - make_plot(filt_df, sample_id, filters=True, at=False) - pdf.savefig() - make_plot(df, sample_id) - pdf.savefig() - make_plot(df, sample_id, sameyaxis=True) - pdf.savefig() + if df[df["SampleID"] == sample_id].empty: + print(f"{sample_id} does not have any reads passing filter. Skipping to next sample.") + else: + with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: + if not filt_df[filt_df["SampleID"] == sample_id].empty: + make_plot(filt_df, sample_id, output_name, kit, filters=True, at=False) + pdf.savefig() + make_plot(df, sample_id, output_name, kit) + pdf.savefig() + make_plot(df, sample_id, output_name, kit, sameyaxis=True) + pdf.savefig() -def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): +def make_plot(df, sample_id, output_name, kit, sameyaxis=False, filters=False, at=True): sample_df = df[df["SampleID"] == sample_id].copy() conditions = [ sample_df["allele_type"].str.contains("Typed"), @@ -389,12 +346,26 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): increase_value = int(math.ceil((max_yvalue / 5) / n)) * n fig = plt.figure(figsize=(30, 30)) n = 0 - for marker in sample_df["Locus"].unique(): - if marker in strs or marker in ystrs: - n += 1 - colors = {"Typed": "green", "Stutter": "blue", "BelowAT": "red", "Deleted": "purple"} - marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") - ax = fig.add_subplot(6, 5, n) + if kit == "powerseq": + str_list = ( + str_lists["powerseq_ystrs"] if "sexloci" in output_name else str_lists["powerseq_strs"] + ) + else: + str_list = ( + str_lists["forenseq_ystrs"] if "sexloci" in output_name else str_lists["forenseq_strs"] + ) + for marker in str_list: + n += 1 + colors = {"Typed": "green", "Stutter": "blue", "BelowAT": "red", "Deleted": "purple"} + marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") + ax = fig.add_subplot(6, 5, n) + if not marker_df.empty: + if marker == "AMELOGENIN": + for i, row in marker_df.iterrows(): + marker_df.loc[i, "CE_Allele"] = ( + 0 if marker_df.loc[i, "CE_Allele"] == "X" else 1 + ) + marker_df["CE_Allele"] = marker_df["CE_Allele"].astype(float) p = ax.bar( marker_df["CE_Allele"], marker_df["Reads"], @@ -407,15 +378,23 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): ax.text(round(min(marker_df["CE_Allele"])) - 0.9, at + (at * 0.1), f"AT", size=12) labels = marker_df["Type"].unique() handles = [plt.Rectangle((0, 0), 1, 1, color=colors[l]) for l in labels] + if marker == "AMELOGENIN": + plt.xlim(-1, 2) + ax.set_xticks(np.arange(-1, 3, 1)) + labels_x = ["", "X", "Y", ""] + ax.set_xticklabels(labels_x) if not filters: plt.legend(handles, labels, title="Allele Type") else: for i, row in marker_df.iterrows(): - marker_df.loc[i, "Label"] = ( - str(int(marker_df.loc[i, "CE_Allele"])) - if ".0" in str(marker_df.loc[i, "CE_Allele"]) - else str(marker_df.loc[i, "CE_Allele"]) - ) + if marker == "AMELOGENIN": + marker_df.loc[i, "Label"] = "X" if row["CE_Allele"] == 0 else "Y" + else: + marker_df.loc[i, "Label"] = ( + str(int(row["CE_Allele"])) + if ".0" in str(row["CE_Allele"]) + else str(row["CE_Allele"]) + ) ax.bar_label(p, labels=marker_df["Label"]) if sameyaxis: plt.ylim(0, max_yvalue) @@ -427,7 +406,7 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): 1.0, ) ) - ax.title.set_text(marker) + ax.title.set_text(marker) if sameyaxis: title = "Marker Plots for All Alleles With Same Y-Axis Scale" elif filters: @@ -460,6 +439,7 @@ def process_input( profile_type, data_type, output_type, + kit, strand="forward", nofiltering=False, separate=False, @@ -480,18 +460,20 @@ def process_input( ) if nofiltering: full_df["allele_type"] = "Typed" - marker_plots(full_df, input_name, sex) + marker_plots(full_df, input_name, kit) if output_type == "efm" or output_type == "mpsproto": - EFM_output(full_df, outpath, profile_type, data_type, brack_col, sex, separate) + EFM_output(full_df, outpath, profile_type, data_type, brack_col, sex, kit, separate) else: STRmix_output(full_df, outpath, profile_type, data_type, seq_col) else: dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} - final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col) + final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col, kit) if final_df is not None: - marker_plots(final_df, input_name, sex) + marker_plots(final_df, input_name, kit) if output_type == "efm" or output_type == "mpsproto": - EFM_output(final_df, outpath, profile_type, data_type, brack_col, sex, separate) + EFM_output( + final_df, outpath, profile_type, data_type, brack_col, sex, kit, separate + ) else: STRmix_output(final_df, outpath, profile_type, data_type, seq_col) if info: @@ -515,6 +497,7 @@ def main( strand, custom, sex, + kit, ): input = str(input) if profile_type not in ("evidence", "reference"): @@ -534,6 +517,7 @@ def main( profile_type, data_type, output_type, + kit, strand=strand, nofiltering=nofilters, separate=separate, @@ -549,6 +533,7 @@ def main( profile_type, data_type, output_type, + kit, strand=strand, nofiltering=nofilters, separate=separate, @@ -571,4 +556,5 @@ def main( strand=snakemake.params.strand, custom=snakemake.params.custom, sex=snakemake.params.sex, + kit=snakemake.params.kit, ) diff --git a/lusSTR/wrappers/format.py b/lusSTR/wrappers/format.py index 410bc303..1ce3aa08 100644 --- a/lusSTR/wrappers/format.py +++ b/lusSTR/wrappers/format.py @@ -59,7 +59,7 @@ def parse_str_table_from_sheet(infile, sheet, exclude=None): def uas_format(infile, sexloci=False): - auto_strs = parse_str_table_from_sheet(infile, sheet="Autosomal STRs", exclude=["Amelogenin"]) + auto_strs = parse_str_table_from_sheet(infile, sheet="Autosomal STRs") sex_strs = None if sexloci is True: y_strs = parse_str_table_from_sheet(infile, "Y STRs") @@ -71,6 +71,7 @@ def uas_format(infile, sexloci=False): def nonuas_load(inpath, software, sexloci=False): """Format a directory of STRait Razor/GeneMarker output files.""" locus_list = [ + "Amelogenin", "CSF1PO", "D10S1248", "D12S391",