From 7752853ca7c95fd29aee91b8fafc90f841970ca0 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Tue, 6 May 2025 06:20:18 -0400 Subject: [PATCH 01/21] processing amelogenin from UAS sample details report [skip ci] --- lusSTR/data/str_markers.json | 19 +++++++++++++++++++ lusSTR/scripts/marker.py | 24 ++++++++++++++++++++++++ lusSTR/wrappers/convert.py | 2 -- lusSTR/wrappers/format.py | 3 ++- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/lusSTR/data/str_markers.json b/lusSTR/data/str_markers.json index 38c613d5..1527c157 100644 --- a/lusSTR/data/str_markers.json +++ b/lusSTR/data/str_markers.json @@ -1,4 +1,23 @@ { + "AMELOGENIN": { + "BasesToSubtract": 47, + "NumRepeats": 1, + "Repeats": [ + "AAAGTG" + ], + "NumBasesToSeparate": 0, + "ReverseCompNeeded": "No", + "LUS": "", + "Sec": "", + "Tert": "", + "Foren_5": 26, + "Foren_3": 37, + "Power_5": 10, + "Power_3": 37, + "Custom_5": 10, + "Custom_3": 37, + "Alleles": ["0", "1"] + }, "CSF1PO": { "BasesToSubtract": 0, "NumRepeats": 1, diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index ab91ae6f..bef170b6 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -355,6 +355,29 @@ def summary(self): ] +class STRMarker_Amelogenin(STRMarker): + @property + def canonical(self): + if self.uas_sequence == "AAAGTG": + return "Y" + else: + return "X" + + @property + def summary(self): + return [ + self.uas_sequence, + self.forward_sequence, + self.custom_sequence, + self.uas_sequence, + self.uas_sequence, + self.uas_sequence, + self.canonical, + "NA", + "NA", + ] + + class STRMarker_D8S1179(STRMarker): @property def flank_5p(self): @@ -1742,6 +1765,7 @@ def flank_5p(self): def STRMarkerObject(locus, sequence, software, custom=False, kit="forenseq"): constructors = { + "AMELOGENIN": STRMarker_Amelogenin, "D8S1179": STRMarker_D8S1179, "D13S317": STRMarker_D13S317, "D20S482": STRMarker_D20S482, diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index cc79317b..4841d242 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -58,8 +58,6 @@ def format_table(input, software, kit="forenseq", custom=False): locus = "PENTA E" if locus == "DYS385A/B" or locus == "DYS385": locus = "DYS385A-B" - if locus == "AMELOGENIN": - continue metadata = str_marker_data[locus] if kit == "forenseq": remove_5p = metadata["Foren_5"] diff --git a/lusSTR/wrappers/format.py b/lusSTR/wrappers/format.py index 410bc303..4e7c080f 100644 --- a/lusSTR/wrappers/format.py +++ b/lusSTR/wrappers/format.py @@ -59,7 +59,7 @@ def parse_str_table_from_sheet(infile, sheet, exclude=None): def uas_format(infile, sexloci=False): - auto_strs = parse_str_table_from_sheet(infile, sheet="Autosomal STRs", exclude=["Amelogenin"]) + auto_strs = parse_str_table_from_sheet(infile, sheet="Autosomal STRs") sex_strs = None if sexloci is True: y_strs = parse_str_table_from_sheet(infile, "Y STRs") @@ -71,6 +71,7 @@ def uas_format(infile, sexloci=False): def nonuas_load(inpath, software, sexloci=False): """Format a directory of STRait Razor/GeneMarker output files.""" locus_list = [ + "AMELOGENIN", "CSF1PO", "D10S1248", "D12S391", From fb465c4d715b31e1c34ff3e23ea0f34c99040e73 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 7 May 2025 06:27:29 -0400 Subject: [PATCH 02/21] steps through convert can use amelogenin [skip ci] --- lusSTR/data/str_markers.json | 4 +-- lusSTR/scripts/marker.py | 68 +++++++++++++++++++++++++++++------- lusSTR/wrappers/convert.py | 6 +++- lusSTR/wrappers/format.py | 2 +- 4 files changed, 64 insertions(+), 16 deletions(-) diff --git a/lusSTR/data/str_markers.json b/lusSTR/data/str_markers.json index 1527c157..1cd584b6 100644 --- a/lusSTR/data/str_markers.json +++ b/lusSTR/data/str_markers.json @@ -1,6 +1,6 @@ { "AMELOGENIN": { - "BasesToSubtract": 47, + "BasesToSubtract": 0, "NumRepeats": 1, "Repeats": [ "AAAGTG" @@ -16,7 +16,7 @@ "Power_3": 37, "Custom_5": 10, "Custom_3": 37, - "Alleles": ["0", "1"] + "Alleles": ["X", "Y"] }, "CSF1PO": { "BasesToSubtract": 0, diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index bef170b6..666bf76e 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -63,7 +63,10 @@ def __init__(self, locus, sequence, software, custom=False, kit="forenseq"): @property def repeat_size(self): - return len(self.data["LUS"]) + if self.data["LUS"] != "": + return len(self.data["LUS"]) + else: + return 1 @property def repeats(self): @@ -356,6 +359,20 @@ def summary(self): class STRMarker_Amelogenin(STRMarker): + @property + def forward_sequence(self): + if self.software == "uas": + return self.sequence + front, back = self._uas_bases_to_trim() + if len(self.sequence) == 0: + back = None + else: + back *= -1 + if self.sequence[front:back] == "": + return "" + else: + return self.sequence[front:back] + @property def canonical(self): if self.uas_sequence == "AAAGTG": @@ -363,19 +380,46 @@ def canonical(self): else: return "X" + @property + def convert(self): + if self.forward_sequence == "": + return "" + else: + return self.forward_sequence + + @property + def custom_brack(self): + if self.forward_sequence == "": + return "" + else: + return "NA" + @property def summary(self): - return [ - self.uas_sequence, - self.forward_sequence, - self.custom_sequence, - self.uas_sequence, - self.uas_sequence, - self.uas_sequence, - self.canonical, - "NA", - "NA", - ] + if self.uas_sequence == "AAAGTG": + return [ + "AAAGTG", + "AAAGTG", + "AAAGTG", + "AAAGTG", + "NA", + "NA", + "Y", + "NA", + "NA", + ] + elif self.uas_sequence == "": + return [ + "", + "", + "", + "", + "NA", + "NA", + "X", + "NA", + "NA", + ] class STRMarker_D8S1179(STRMarker): diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index 4841d242..dbef0c5b 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -65,7 +65,11 @@ def format_table(input, software, kit="forenseq", custom=False): else: remove_5p = metadata["Power_5"] remove_3p = metadata["Power_3"] - if len(sequence) <= (remove_5p + remove_3p) and software != "uas": + if ( + len(sequence) <= (remove_5p + remove_3p) + and software != "uas" + and locus != "AMELOGENIN" + ): flank_summary = [ sampleid, project, diff --git a/lusSTR/wrappers/format.py b/lusSTR/wrappers/format.py index 4e7c080f..1ce3aa08 100644 --- a/lusSTR/wrappers/format.py +++ b/lusSTR/wrappers/format.py @@ -71,7 +71,7 @@ def uas_format(infile, sexloci=False): def nonuas_load(inpath, software, sexloci=False): """Format a directory of STRait Razor/GeneMarker output files.""" locus_list = [ - "AMELOGENIN", + "Amelogenin", "CSF1PO", "D10S1248", "D12S391", From 7a7ad3bdfdbdfc5323f2365c2b033283369d0667 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Thu, 8 May 2025 06:32:00 -0400 Subject: [PATCH 03/21] fixed convert step for crappy sequences in amel and filtering amel sequences [skip ci] --- lusSTR/data/filters.json | 18 +++++++++++++ lusSTR/scripts/filter_settings.py | 38 +++++++++++++++++++++++++++- lusSTR/scripts/marker.py | 42 ++++++++++++++++++++----------- lusSTR/wrappers/convert.py | 2 +- lusSTR/wrappers/filter.py | 4 +++ 5 files changed, 88 insertions(+), 16 deletions(-) diff --git a/lusSTR/data/filters.json b/lusSTR/data/filters.json index 64f06b4b..d241505c 100644 --- a/lusSTR/data/filters.json +++ b/lusSTR/data/filters.json @@ -1,4 +1,22 @@ { + "AMELOGENIN": { + "MinimumNumberReadsForDynamicThresholds": 650, + "DetectionThresholdStaticCount": 10, + "DetectionThresholdDynamicPercent": 0, + "DetectionThresholdUse": "Static", + "AnalyticalThresholdStaticCount": 20, + "AnalyticalThresholdDynamicPercent": 0.017, + "AnalyticalThresholdUse": "Both", + "StochasticThresholdStaticCount": 20, + "StochasticThresholdDynamicPercent": 0.017, + "StochasticThresholdUse": "Both", + "MinimumHeterozygousBalanceThresholdDynamicPercent": 0.50, + "SameSizeThresholdDynamicPercent": 0, + "StutterThresholdDynamicPercent": 0, + "StutterForwardThresholdDynamicPercent": 0, + "Intercept": 0, + "Slope": 0 + }, "CSF1PO": { "MinimumNumberReadsForDynamicThresholds": 650, "DetectionThresholdStaticCount": 10, diff --git a/lusSTR/scripts/filter_settings.py b/lusSTR/scripts/filter_settings.py index e639a33b..63d1fc76 100644 --- a/lusSTR/scripts/filter_settings.py +++ b/lusSTR/scripts/filter_settings.py @@ -28,7 +28,9 @@ def get_filter_metadata_file(): def filters(locus_allele_info, locus, locus_reads, datatype, brack_col): metadata = filter_marker_data[locus] - if len(locus_allele_info) == 1: + if locus == "AMELOGENIN": + locus_allele_info = filter_amel(metadata, locus_allele_info, locus_reads) + elif len(locus_allele_info) == 1: locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info) else: locus_allele_info, locus_reads = multiple_allele_thresholds( @@ -42,6 +44,40 @@ def filters(locus_allele_info, locus, locus_reads, datatype, brack_col): return locus_allele_info +def filter_amel(metadata, amel_df, locus_reads): + for filter in ["Detection", "Analytical"]: + use = metadata[f"{filter}ThresholdUse"] + count = metadata[f"{filter}ThresholdStaticCount"] + perc = metadata[f"{filter}ThresholdDynamicPercent"] + thresh_perc = round(perc * locus_reads, 1) + if ( + use.lower() == "dynamic" + and locus_reads < metadata["MinimumNumberReadsForDynamicThresholds"] + ): + use = "static" + if use.lower() == "both": + thresh = thresh_perc if thresh_perc >= count else count + elif use.lower() == "static": + thresh = count + elif use.lower() == "dynamic": + thresh = thresh_perc + if filter == "Detection": + amel_dt = amel_df[amel_df["Reads"] >= thresh].reset_index(drop=True) + locus_reads = amel_df["Reads"].sum() + else: + for i in range(len(amel_dt)): + al_reads = amel_dt.loc[i, "Reads"] + if al_reads < thresh: + amel_dt.loc[i, ["allele_type", "perc_noise"]] = [ + "BelowAT", + round(al_reads / locus_reads, 3), + ] + else: + amel_dt.loc[i, "allele_tpye"] = "Typed" + print(amel_dt) + return amel_dt + + def single_allele_thresholds(metadata, locus_reads, single_all_df): if thresholds("Detection", metadata, locus_reads, single_all_df["Reads"][0])[1] is False: single_all_df = pd.DataFrame() diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index 666bf76e..2400cfaf 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -377,8 +377,10 @@ def forward_sequence(self): def canonical(self): if self.uas_sequence == "AAAGTG": return "Y" - else: + elif self.uas_sequence == "": return "X" + else: + return self.uas_sequence @property def convert(self): @@ -396,19 +398,19 @@ def custom_brack(self): @property def summary(self): - if self.uas_sequence == "AAAGTG": - return [ - "AAAGTG", - "AAAGTG", - "AAAGTG", - "AAAGTG", - "NA", - "NA", - "Y", - "NA", - "NA", - ] - elif self.uas_sequence == "": + # if self.uas_sequence == "AAAGTG": + # return [ + # "AAAGTG", + # "AAAGTG", + # "AAAGTG", + # "AAAGTG", + # "NA", + # "NA", + # "Y", + # "NA", + # "NA", + # ] + if self.uas_sequence == "": return [ "", "", @@ -420,6 +422,18 @@ def summary(self): "NA", "NA", ] + else: + return [ + self.uas_sequence, + self.forward_sequence, + self.custom_sequence, + self.convert, + self.convert, + self.custom_brack, + self.canonical, + "NA", + "NA", + ] class STRMarker_D8S1179(STRMarker): diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index dbef0c5b..2ad397f5 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -69,7 +69,7 @@ def format_table(input, software, kit="forenseq", custom=False): len(sequence) <= (remove_5p + remove_3p) and software != "uas" and locus != "AMELOGENIN" - ): + ) or (locus == "AMELOGENIN" and len(sequence) < (remove_5p + remove_3p)): flank_summary = [ sampleid, project, diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 782b6be2..ce422c62 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -28,6 +28,7 @@ strs = [ + "AMELOGENIN", "CSF1PO", "D10S1248", "D12S391", @@ -146,6 +147,8 @@ def process_strs(dict_loc, datatype, seq_col, brack_col): filtered_df = filtered_df.replace({"nan": None}) final_df = pd.concat([final_df, filtered_df]) flags_df = pd.concat([flags_df, flags(filtered_df, datatype)]) + # elif locus == "AMELOGENIN": + # final_df = pd.concet([final_df, data_order]) if datatype == "ce" or datatype == "ngs": try: final_df = final_df.astype({"CE_Allele": "float64", "Reads": "int"}) @@ -155,6 +158,7 @@ def process_strs(dict_loc, datatype, seq_col, brack_col): def EFM_output(profile, outfile, profile_type, data_type, col, sex, separate=False): + profile = profile[profile["Locus"] != "AMELOGENIN"] if profile_type == "reference": profile = profile.query("allele_type == 'Typed'") else: From 086c7fe0036473fea0b87f818003acfd933fbbf5 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Tue, 13 May 2025 06:18:52 -0400 Subject: [PATCH 04/21] fixed typo in amel filtering function [skip ci] --- lusSTR/scripts/filter_settings.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lusSTR/scripts/filter_settings.py b/lusSTR/scripts/filter_settings.py index 63d1fc76..77ed5d0c 100644 --- a/lusSTR/scripts/filter_settings.py +++ b/lusSTR/scripts/filter_settings.py @@ -73,8 +73,7 @@ def filter_amel(metadata, amel_df, locus_reads): round(al_reads / locus_reads, 3), ] else: - amel_dt.loc[i, "allele_tpye"] = "Typed" - print(amel_dt) + amel_dt.loc[i, "allele_type"] = "Typed" return amel_dt From 4634d425c97a519f9060dcfe1e9838c2f34b9eed Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 14 May 2025 06:18:47 -0400 Subject: [PATCH 05/21] amelogenin now plotting correctly in pdf [skip ci] --- lusSTR/wrappers/filter.py | 40 ++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index ce422c62..19736cee 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -147,13 +147,11 @@ def process_strs(dict_loc, datatype, seq_col, brack_col): filtered_df = filtered_df.replace({"nan": None}) final_df = pd.concat([final_df, filtered_df]) flags_df = pd.concat([flags_df, flags(filtered_df, datatype)]) - # elif locus == "AMELOGENIN": - # final_df = pd.concet([final_df, data_order]) - if datatype == "ce" or datatype == "ngs": - try: - final_df = final_df.astype({"CE_Allele": "float64", "Reads": "int"}) - except KeyError: - final_df = None + # if datatype == "ce" or datatype == "ngs": + # try: + # final_df = final_df.astype({"CE_Allele": "float64", "Reads": "int"}) + # except KeyError: + # final_df = None return final_df, flags_df @@ -266,6 +264,7 @@ def determine_max_num_alleles(allele_heights): def STRmix_output(profile, outdir, profile_type, data_type, seq_col): + profile = profile[profile["Locus"] != "AMELOGENIN"] Path(outdir).mkdir(parents=True, exist_ok=True) if profile_type == "reference": filtered_df = profile.query("allele_type == 'Typed'") @@ -365,7 +364,6 @@ def format_ref_table(new_rows, sample_data, datatype): def marker_plots(df, output_name, sex, wd="."): Path(f"{wd}/MarkerPlots").mkdir(parents=True, exist_ok=True) - df["CE_Allele"] = df["CE_Allele"].astype(float) filt_df = df[df["allele_type"] == "Typed"] for sample_id in df["SampleID"].unique(): with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: @@ -398,6 +396,12 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): n += 1 colors = {"Typed": "green", "Stutter": "blue", "BelowAT": "red", "Deleted": "purple"} marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") + if marker == "AMELOGENIN": + for i, row in marker_df.iterrows(): + marker_df.loc[i, "CE_Allele"] = ( + 0 if marker_df.loc[i, "CE_Allele"] == "X" else 1 + ) + marker_df["CE_Allele"] = marker_df["CE_Allele"].astype(float) ax = fig.add_subplot(6, 5, n) p = ax.bar( marker_df["CE_Allele"], @@ -411,15 +415,25 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): ax.text(round(min(marker_df["CE_Allele"])) - 0.9, at + (at * 0.1), f"AT", size=12) labels = marker_df["Type"].unique() handles = [plt.Rectangle((0, 0), 1, 1, color=colors[l]) for l in labels] + if marker == "AMELOGENIN": + plt.xlim(-1, 2) + ax.set_xticks(np.arange(-1, 3, 1)) + labels_x = ["", "X", "Y", ""] + ax.set_xticklabels(labels_x) if not filters: plt.legend(handles, labels, title="Allele Type") else: for i, row in marker_df.iterrows(): - marker_df.loc[i, "Label"] = ( - str(int(marker_df.loc[i, "CE_Allele"])) - if ".0" in str(marker_df.loc[i, "CE_Allele"]) - else str(marker_df.loc[i, "CE_Allele"]) - ) + if marker == "AMELOGENIN": + marker_df.loc[i, "Label"] = ( + "X" if marker_df.loc[i, "CE_Allele"] == 0 else "Y" + ) + else: + marker_df.loc[i, "Label"] = ( + str(int(marker_df.loc[i, "CE_Allele"])) + if ".0" in str(marker_df.loc[i, "CE_Allele"]) + else str(marker_df.loc[i, "CE_Allele"]) + ) ax.bar_label(p, labels=marker_df["Label"]) if sameyaxis: plt.ylim(0, max_yvalue) From 992e608c8b3de2dd602a22bb9d9e5e0993e2fa6f Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Thu, 15 May 2025 06:20:05 -0400 Subject: [PATCH 06/21] fixed bug in combining reads when using custom sequence ranges [skip ci] --- lusSTR/wrappers/convert.py | 42 +++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index 2ad397f5..cc54fb6f 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -220,8 +220,36 @@ def check_vwa(marker, sequence, software, custom): return new_marker -def combine_reads(table, columns): - comb_table = table.groupby(columns[:-1], as_index=False)["Reads"].sum() +def combine_reads(table, columns, custom=False): + if custom: + print(table) + comb_table = ( + table.groupby( + [ + "SampleID", + "Project", + "Analysis", + "Locus", + "Custom_Range_Sequence", + "Custom_Bracketed_Notation", + "CE_Allele", + ] + ) + .agg( + { + "UAS_Output_Sequence": lambda x: ", ".join(x), + "Forward_Strand_Sequence": lambda x: ", ".join(x), + "UAS_Output_Bracketed_Notation": lambda x: ", ".join(x), + "Forward_Strand_Bracketed_Notation": lambda x: ", ".join(x), + "LUS": lambda x: ", ".join(x), + "LUS_Plus": lambda x: ", ".join(x), + "Reads": "sum", + } + ) + .reset_index() + ) + else: + comb_table = table.groupby(columns[:-1], as_index=False)["Reads"].sum() sorted = sort_table(comb_table) return sorted @@ -239,7 +267,7 @@ def remove_columns(column_list, remove_list): return column_list -def create_custom_outputtable(columns, table): +def create_custom_outputtable(columns, table, custom): remove_list = [ "UAS_Output_Sequence", "Forward_Strand_Sequence", @@ -275,7 +303,9 @@ def main(input, out, kit, software, sex, nocombine, custom): sex_final_table = combine_reads(sex_final_table, sex_columns) sex_final_table.to_csv(f"{full_table_name}_sexloci.txt", sep="\t", index=False) if custom: - sex_table_custom = create_custom_outputtable(sex_columns, sex_final_table) + sex_table_custom = create_custom_outputtable( + sex_columns, sex_final_table, custom=True + ) sex_table_custom.to_csv(f"{output_name}_sexloci.txt", index=False, sep="\t") else: sex_final_table.to_csv(f"{output_name}_sexloci.txt", sep="\t", index=False) @@ -289,7 +319,9 @@ def main(input, out, kit, software, sex, nocombine, custom): autosomal_final_table = combine_reads(autosomal_final_table, columns) autosomal_final_table.to_csv(f"{full_table_name}.txt", sep="\t", index=False) if custom: - custom_table_comb = create_custom_outputtable(columns, autosomal_final_table) + custom_table_comb = create_custom_outputtable( + columns, autosomal_final_table, custom=True + ) custom_table_comb.to_csv(out, sep="\t", index=False) else: autosomal_final_table.to_csv(out, sep="\t", index=False) From dc557e0fce5e9102e8630ff0960ce4a45e8de12d Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Fri, 16 May 2025 05:47:27 -0400 Subject: [PATCH 07/21] fixed bug with custom sequence ranges in amel [skip ci] --- lusSTR/scripts/marker.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index 2400cfaf..4468f335 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -373,6 +373,22 @@ def forward_sequence(self): else: return self.sequence[front:back] + @property + def custom_sequence(self): + if self.custom: + custom_front = self.data["Custom_5"] + custom_back = self.data["Custom_3"] + if custom_back == 0: + custom_back = None + else: + custom_back *= -1 + if self.sequence[custom_front:custom_back] == "": + return "" + else: + return self.sequence[custom_front:custom_back] + else: + return None + @property def canonical(self): if self.uas_sequence == "AAAGTG": @@ -398,18 +414,6 @@ def custom_brack(self): @property def summary(self): - # if self.uas_sequence == "AAAGTG": - # return [ - # "AAAGTG", - # "AAAGTG", - # "AAAGTG", - # "AAAGTG", - # "NA", - # "NA", - # "Y", - # "NA", - # "NA", - # ] if self.uas_sequence == "": return [ "", From 4ce279cabae641554810b5b88374aa2789648bda Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Mon, 19 May 2025 06:31:31 -0400 Subject: [PATCH 08/21] began implementing amel into GUI marker plots [skip ci] --- lusSTR/cli/gui.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/lusSTR/cli/gui.py b/lusSTR/cli/gui.py index 1971a6a9..4b0dcfc3 100644 --- a/lusSTR/cli/gui.py +++ b/lusSTR/cli/gui.py @@ -201,7 +201,13 @@ def interactive_plots_allmarkers(sample_df, flagged_df): col = cols[n] container = col.container(border=True) sample_locus = sample_df["SampleID"].unique() + "_" + marker - marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") + for i, row in sample_df.iterrows(): + if sample_df.loc[i, "Locus"] == "AMELOGENIN": + sample_df.loc[i, "CE_Allele"] = 0 if sample_df.loc[i, "CE_Allele"] == "X" else 1 + sample_df["CE_Allele"] = pd.to_numeric(sample_df["CE_Allele"]) + marker_df = sample_df[sample_df["Locus"] == marker].sort_values( + by=["CE_Allele", "allele_type"], ascending=[False, True] + ) if sample_locus in flagged_df["key"].values: marker = f"⚠️{marker}⚠️" plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True) @@ -240,9 +246,14 @@ def interactive_plots(df, locus, ymax, increase, all=False): ) plot.add_hline(y=at, line_width=3, line_dash="dot", line_color="gray") plot.add_annotation(text=f"AT", x=min_x + 0.1, y=at, showarrow=False, yshift=10) - plot.update_layout( - xaxis=dict(range=[min_x, max_x], tickmode="array", tickvals=np.arange(min_x, max_x, 1)) - ) + if locus == "AMELOGENIN": + plot.update_layout( + xaxis=dict(range=[-1, 2], tickmode="array", tickvals=["", "X", "Y", ""]) + ) + else: + plot.update_layout( + xaxis=dict(range=[min_x, max_x], tickmode="array", tickvals=np.arange(min_x, max_x, 1)) + ) if all: plot.update_layout( yaxis=dict(range=[0, ymax], tickmode="array", tickvals=np.arange(0, ymax, increase)) @@ -307,6 +318,10 @@ def interactive_setup(df1, file): ) interactive_plots_allmarkers(sample_df, flags) else: + for i, row in sample_df.iterrows(): + if sample_df.loc[i, "Locus"] == "AMELOGENIN": + sample_df.loc[i, "CE_Allele"] = 0 if sample_df.loc[i, "CE_Allele"] == "X" else 1 + sample_df["CE_Allele"] = pd.to_numeric(sample_df["CE_Allele"]) locus_key = f"{sample}_{locus}" if locus_key not in st.session_state: st.session_state[locus_key] = sample_df[sample_df["Locus"] == locus].reset_index( From 65b878a2e32a012481a6d63a76a8344d8152a9db Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Tue, 20 May 2025 12:28:06 -0400 Subject: [PATCH 09/21] fixed custom range for amel [skip ci] --- lusSTR/data/str_markers.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lusSTR/data/str_markers.json b/lusSTR/data/str_markers.json index 1cd584b6..cf030cb4 100644 --- a/lusSTR/data/str_markers.json +++ b/lusSTR/data/str_markers.json @@ -14,8 +14,8 @@ "Foren_3": 37, "Power_5": 10, "Power_3": 37, - "Custom_5": 10, - "Custom_3": 37, + "Custom_5": 0, + "Custom_3": 0, "Alleles": ["X", "Y"] }, "CSF1PO": { From 01472ee309f9df239d1f4c7462e51e27e9e02088 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Tue, 20 May 2025 12:29:31 -0400 Subject: [PATCH 10/21] handling samples with no sequences passing filters [skip ci] --- lusSTR/scripts/filter_settings.py | 22 ++++++++++++---------- lusSTR/scripts/marker.py | 3 +-- lusSTR/wrappers/convert.py | 8 ++++++-- lusSTR/wrappers/filter.py | 30 ++++++++++++++++++------------ 4 files changed, 37 insertions(+), 26 deletions(-) diff --git a/lusSTR/scripts/filter_settings.py b/lusSTR/scripts/filter_settings.py index 77ed5d0c..88430dc6 100644 --- a/lusSTR/scripts/filter_settings.py +++ b/lusSTR/scripts/filter_settings.py @@ -30,17 +30,19 @@ def filters(locus_allele_info, locus, locus_reads, datatype, brack_col): metadata = filter_marker_data[locus] if locus == "AMELOGENIN": locus_allele_info = filter_amel(metadata, locus_allele_info, locus_reads) - elif len(locus_allele_info) == 1: - locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info) else: - locus_allele_info, locus_reads = multiple_allele_thresholds( - metadata, locus_reads, locus_allele_info - ) - locus_allele_info = ce_filtering( - locus_allele_info, locus_reads, metadata, datatype, brack_col - ) - if datatype != "ce": - locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype) + locus_allele_info["CE_Allele"] = locus_allele_info["CE_Allele"].astype(float) + if len(locus_allele_info) == 1: + locus_allele_info = single_allele_thresholds(metadata, locus_reads, locus_allele_info) + else: + locus_allele_info, locus_reads = multiple_allele_thresholds( + metadata, locus_reads, locus_allele_info + ) + locus_allele_info = ce_filtering( + locus_allele_info, locus_reads, metadata, datatype, brack_col + ) + if datatype != "ce": + locus_allele_info = same_size_filter(locus_allele_info, metadata, datatype) return locus_allele_info diff --git a/lusSTR/scripts/marker.py b/lusSTR/scripts/marker.py index 4468f335..95aa2baa 100644 --- a/lusSTR/scripts/marker.py +++ b/lusSTR/scripts/marker.py @@ -376,8 +376,7 @@ def forward_sequence(self): @property def custom_sequence(self): if self.custom: - custom_front = self.data["Custom_5"] - custom_back = self.data["Custom_3"] + custom_front, custom_back = self._uas_bases_to_trim() if custom_back == 0: custom_back = None else: diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index cc54fb6f..8115c09f 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -65,8 +65,13 @@ def format_table(input, software, kit="forenseq", custom=False): else: remove_5p = metadata["Power_5"] remove_3p = metadata["Power_3"] + if custom: + if metadata["Custom_5"] < 0: + remove_5p = remove_5p - metadata["Custom_5"] + if metadata["Custom_3"] < 0: + remove_3p = remove_3p - metadata["Custom_3"] if ( - len(sequence) <= (remove_5p + remove_3p) + len(sequence) <= (remove_5p + remove_3p + len(metadata["LUS"])) and software != "uas" and locus != "AMELOGENIN" ) or (locus == "AMELOGENIN" and len(sequence) < (remove_5p + remove_3p)): @@ -222,7 +227,6 @@ def check_vwa(marker, sequence, software, custom): def combine_reads(table, columns, custom=False): if custom: - print(table) comb_table = ( table.groupby( [ diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 19736cee..17607a3d 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -155,7 +155,7 @@ def process_strs(dict_loc, datatype, seq_col, brack_col): return final_df, flags_df -def EFM_output(profile, outfile, profile_type, data_type, col, sex, separate=False): +def EFM_output(profile, id_list, outfile, profile_type, data_type, col, sex, separate=False): profile = profile[profile["Locus"] != "AMELOGENIN"] if profile_type == "reference": profile = profile.query("allele_type == 'Typed'") @@ -263,7 +263,7 @@ def determine_max_num_alleles(allele_heights): return max_num_alleles -def STRmix_output(profile, outdir, profile_type, data_type, seq_col): +def STRmix_output(profile, outdir, profile_type, data_type, seq_col, id_list): profile = profile[profile["Locus"] != "AMELOGENIN"] Path(outdir).mkdir(parents=True, exist_ok=True) if profile_type == "reference": @@ -288,7 +288,6 @@ def STRmix_output(profile, outdir, profile_type, data_type, seq_col): {"Locus": {"VWA": "vWA", "PENTA D": "PentaD", "PENTA E": "PentaE"}}, inplace=True ) Path(outdir).mkdir(exist_ok=True) - id_list = strmix_profile["SampleID"].unique() for id in id_list: sample_df = strmix_profile[strmix_profile["SampleID"] == id].reset_index(drop=True) if profile_type == "evidence": @@ -366,13 +365,17 @@ def marker_plots(df, output_name, sex, wd="."): Path(f"{wd}/MarkerPlots").mkdir(parents=True, exist_ok=True) filt_df = df[df["allele_type"] == "Typed"] for sample_id in df["SampleID"].unique(): - with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: - make_plot(filt_df, sample_id, filters=True, at=False) - pdf.savefig() - make_plot(df, sample_id) - pdf.savefig() - make_plot(df, sample_id, sameyaxis=True) - pdf.savefig() + if df[df["SampleID"] == sample_id].empty: + print(f"{sample_id} does not have any reads passing filter. Skipping to next sample.") + else: + with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: + if not filt_df[filt_df["SampleID"] == sample_id].empty: + make_plot(filt_df, sample_id, filters=True, at=False) + pdf.savefig() + make_plot(df, sample_id) + pdf.savefig() + make_plot(df, sample_id, sameyaxis=True) + pdf.savefig() def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): @@ -505,13 +508,16 @@ def process_input( STRmix_output(full_df, outpath, profile_type, data_type, seq_col) else: dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} + id_list = full_df["SampleID"].unique() final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col) if final_df is not None: marker_plots(final_df, input_name, sex) if output_type == "efm" or output_type == "mpsproto": - EFM_output(final_df, outpath, profile_type, data_type, brack_col, sex, separate) + EFM_output( + final_df, id_list, outpath, profile_type, data_type, brack_col, sex, separate + ) else: - STRmix_output(final_df, outpath, profile_type, data_type, seq_col) + STRmix_output(final_df, outpath, profile_type, data_type, seq_col, id_list) if info: name = os.path.basename(outpath) final_df.to_csv(f"{outpath}/{input_name}_sequence_info.csv", index=False) From ebc7fc3b2646d9b97e21c6d8769c60ab403d1f0c Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Tue, 20 May 2025 13:43:10 -0400 Subject: [PATCH 11/21] fixed plotting amel in gui [skip ci] --- lusSTR/cli/gui.py | 15 +++++++-------- lusSTR/wrappers/filter.py | 6 ++---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/lusSTR/cli/gui.py b/lusSTR/cli/gui.py index 4b0dcfc3..e5f88f2e 100644 --- a/lusSTR/cli/gui.py +++ b/lusSTR/cli/gui.py @@ -248,7 +248,7 @@ def interactive_plots(df, locus, ymax, increase, all=False): plot.add_annotation(text=f"AT", x=min_x + 0.1, y=at, showarrow=False, yshift=10) if locus == "AMELOGENIN": plot.update_layout( - xaxis=dict(range=[-1, 2], tickmode="array", tickvals=["", "X", "Y", ""]) + xaxis=dict(tickvals=np.arange(-1, 2, 1), tickmode="array", ticktext=["", "X", "Y", ""]) ) else: plot.update_layout( @@ -318,15 +318,14 @@ def interactive_setup(df1, file): ) interactive_plots_allmarkers(sample_df, flags) else: - for i, row in sample_df.iterrows(): - if sample_df.loc[i, "Locus"] == "AMELOGENIN": - sample_df.loc[i, "CE_Allele"] = 0 if sample_df.loc[i, "CE_Allele"] == "X" else 1 - sample_df["CE_Allele"] = pd.to_numeric(sample_df["CE_Allele"]) + plot_df = sample_df + for i, row in plot_df.iterrows(): + if plot_df.loc[i, "Locus"] == "AMELOGENIN": + plot_df.loc[i, "CE_Allele"] = 0 if plot_df.loc[i, "CE_Allele"] == "X" else 1 + plot_df["CE_Allele"] = pd.to_numeric(plot_df["CE_Allele"]) locus_key = f"{sample}_{locus}" if locus_key not in st.session_state: - st.session_state[locus_key] = sample_df[sample_df["Locus"] == locus].reset_index( - drop=True - ) + st.session_state[locus_key] = plot_df[plot_df["Locus"] == locus].reset_index(drop=True) Type = [ "Deleted", "Typed", diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 17607a3d..17dc321f 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -155,7 +155,7 @@ def process_strs(dict_loc, datatype, seq_col, brack_col): return final_df, flags_df -def EFM_output(profile, id_list, outfile, profile_type, data_type, col, sex, separate=False): +def EFM_output(profile, outfile, profile_type, data_type, col, sex, separate=False): profile = profile[profile["Locus"] != "AMELOGENIN"] if profile_type == "reference": profile = profile.query("allele_type == 'Typed'") @@ -513,9 +513,7 @@ def process_input( if final_df is not None: marker_plots(final_df, input_name, sex) if output_type == "efm" or output_type == "mpsproto": - EFM_output( - final_df, id_list, outpath, profile_type, data_type, brack_col, sex, separate - ) + EFM_output(final_df, outpath, profile_type, data_type, brack_col, sex, separate) else: STRmix_output(final_df, outpath, profile_type, data_type, seq_col, id_list) if info: From c2929b1ebcdd34b21369807a181d16d22605c9d9 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 21 May 2025 16:34:05 -0400 Subject: [PATCH 12/21] added blank plots for missing loci [skip ci] --- lusSTR/wrappers/filter.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 17dc321f..780d1c45 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -147,11 +147,6 @@ def process_strs(dict_loc, datatype, seq_col, brack_col): filtered_df = filtered_df.replace({"nan": None}) final_df = pd.concat([final_df, filtered_df]) flags_df = pd.concat([flags_df, flags(filtered_df, datatype)]) - # if datatype == "ce" or datatype == "ngs": - # try: - # final_df = final_df.astype({"CE_Allele": "float64", "Reads": "int"}) - # except KeyError: - # final_df = None return final_df, flags_df @@ -361,7 +356,7 @@ def format_ref_table(new_rows, sample_data, datatype): return sort_df -def marker_plots(df, output_name, sex, wd="."): +def marker_plots(df, output_name, wd="."): Path(f"{wd}/MarkerPlots").mkdir(parents=True, exist_ok=True) filt_df = df[df["allele_type"] == "Typed"] for sample_id in df["SampleID"].unique(): @@ -370,15 +365,15 @@ def marker_plots(df, output_name, sex, wd="."): else: with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: if not filt_df[filt_df["SampleID"] == sample_id].empty: - make_plot(filt_df, sample_id, filters=True, at=False) + make_plot(filt_df, sample_id, output_name, filters=True, at=False) pdf.savefig() - make_plot(df, sample_id) + make_plot(df, sample_id, output_name) pdf.savefig() - make_plot(df, sample_id, sameyaxis=True) + make_plot(df, sample_id, output_name, sameyaxis=True) pdf.savefig() -def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): +def make_plot(df, sample_id, output_name, sameyaxis=False, filters=False, at=True): sample_df = df[df["SampleID"] == sample_id].copy() conditions = [ sample_df["allele_type"].str.contains("Typed"), @@ -394,18 +389,20 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): increase_value = int(math.ceil((max_yvalue / 5) / n)) * n fig = plt.figure(figsize=(30, 30)) n = 0 - for marker in sample_df["Locus"].unique(): - if marker in strs or marker in ystrs: - n += 1 - colors = {"Typed": "green", "Stutter": "blue", "BelowAT": "red", "Deleted": "purple"} - marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") + str_list = ystrs if "sexloci" in output_name else strs + for marker in str_list: + n += 1 + colors = {"Typed": "green", "Stutter": "blue", "BelowAT": "red", "Deleted": "purple"} + marker_df = sample_df[sample_df["Locus"] == marker].sort_values(by="CE_Allele") + ax = fig.add_subplot(6, 5, n) + if not marker_df.empty: if marker == "AMELOGENIN": for i, row in marker_df.iterrows(): marker_df.loc[i, "CE_Allele"] = ( 0 if marker_df.loc[i, "CE_Allele"] == "X" else 1 ) marker_df["CE_Allele"] = marker_df["CE_Allele"].astype(float) - ax = fig.add_subplot(6, 5, n) + # ax = fig.add_subplot(6, 5, n) p = ax.bar( marker_df["CE_Allele"], marker_df["Reads"], @@ -448,7 +445,7 @@ def make_plot(df, sample_id, sameyaxis=False, filters=False, at=True): 1.0, ) ) - ax.title.set_text(marker) + ax.title.set_text(marker) if sameyaxis: title = "Marker Plots for All Alleles With Same Y-Axis Scale" elif filters: @@ -501,7 +498,7 @@ def process_input( ) if nofiltering: full_df["allele_type"] = "Typed" - marker_plots(full_df, input_name, sex) + marker_plots(full_df, input_name) if output_type == "efm" or output_type == "mpsproto": EFM_output(full_df, outpath, profile_type, data_type, brack_col, sex, separate) else: @@ -511,7 +508,7 @@ def process_input( id_list = full_df["SampleID"].unique() final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col) if final_df is not None: - marker_plots(final_df, input_name, sex) + marker_plots(final_df, input_name) if output_type == "efm" or output_type == "mpsproto": EFM_output(final_df, outpath, profile_type, data_type, brack_col, sex, separate) else: From 694c980227016a9a7cdca9e0a86e2ab8c4169237 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Fri, 23 May 2025 14:02:19 -0400 Subject: [PATCH 13/21] made str lists specific for each kit [skip ci] --- lusSTR/workflows/strs.smk | 3 +- lusSTR/wrappers/filter.py | 100 +++++++++++++++++++++++++++++++------- 2 files changed, 85 insertions(+), 18 deletions(-) diff --git a/lusSTR/workflows/strs.smk b/lusSTR/workflows/strs.smk index 3e2ad4e7..fb77fa4d 100644 --- a/lusSTR/workflows/strs.smk +++ b/lusSTR/workflows/strs.smk @@ -150,7 +150,8 @@ rule filter: filters=config["nofilters"], strand=config["strand"], custom=config["custom_ranges"], - sex=config["sex"] + sex=config["sex"], + kit=config["kit"] script: lusSTR.wrapper("filter") diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 780d1c45..1f818d2b 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -27,7 +27,34 @@ import sys -strs = [ +p_strs = [ + "AMELOGENIN", + "CSF1PO", + "D10S1248", + "D12S391", + "D13S317", + "D16S539", + "D18S51", + "D19S433", + "D1S1656", + "D21S11", + "D22S1045", + "D2S1338", + "D2S441", + "D3S1358", + "D5S818", + "D6S1043", + "D7S820", + "D8S1179", + "FGA", + "PENTA D", + "PENTA E", + "TH01", + "TPOX", + "VWA", +] + +f_strs = [ "AMELOGENIN", "CSF1PO", "D10S1248", @@ -58,7 +85,7 @@ "VWA", ] -ystrs = [ +p_ystrs = [ "DYS19", "DYS385A-B", "DYS389II", @@ -82,6 +109,31 @@ "Y-GATA-H4", ] +f_ystrs = [ + "DYS19", + "DYS385A-B", + "DYS389II", + "DYS390", + "DYS391", + "DYS392", + "DYS437", + "DYS438", + "DYS439", + "DYS448", + "DYS460", + "DYS481", + "DYS505", + "DYS522", + "DYS533", + "DYS549", + "DYS570", + "DYS576", + "DYS612", + "DYS635", + "DYS643", + "Y-GATA-H4", +] + def get_filter_metadata_file(): return importlib.resources.files("lusSTR") / "data/filters.json" @@ -91,9 +143,11 @@ def get_filter_metadata_file(): filter_marker_data = json.load(fh) -def process_strs(dict_loc, datatype, seq_col, brack_col): +def process_strs(dict_loc, datatype, seq_col, brack_col, kit): final_df = pd.DataFrame() flags_df = pd.DataFrame() + strs = p_strs if kit == "powerseq" else f_strs + ystrs = p_ystrs if kit == "powerseq" else f_ystrs for key, value in dict_loc.items(): data = dict_loc[key].reset_index(drop=True) if datatype == "ce": @@ -150,20 +204,20 @@ def process_strs(dict_loc, datatype, seq_col, brack_col): return final_df, flags_df -def EFM_output(profile, outfile, profile_type, data_type, col, sex, separate=False): +def EFM_output(profile, outfile, profile_type, data_type, col, sex, kit, separate=False): profile = profile[profile["Locus"] != "AMELOGENIN"] if profile_type == "reference": profile = profile.query("allele_type == 'Typed'") else: profile = profile.query("allele_type != ['BelowAT', 'Deleted']") - efm_profile = populate_efm_profile(profile, data_type, col, sex) + efm_profile = populate_efm_profile(profile, data_type, col, sex, kit) if separate: write_sample_specific_efm_profiles(efm_profile, profile_type, data_type, outfile) else: write_aggregate_efm_profile(efm_profile, profile_type, data_type, outfile) -def populate_efm_profile(profile, data_type, colname, sex): +def populate_efm_profile(profile, data_type, colname, sex, kit): if data_type == "ce": prof_col = "CE_Allele" elif data_type == "lusplus": @@ -186,6 +240,8 @@ def populate_efm_profile(profile, data_type, colname, sex): allele_heights[row.SampleID][row.Locus][row.Allele] = int(row.Reads) max_num_alleles = determine_max_num_alleles(allele_heights) reformatted_profile = list() + strs = p_strs if kit == "powerseq" else f_strs + ystrs = p_ystrs if kit == "powerseq" else f_ystrs for sampleid, loci in allele_heights.items(): for locusid, alleles in loci.items(): allele_list, height_list = list(), list() @@ -356,7 +412,7 @@ def format_ref_table(new_rows, sample_data, datatype): return sort_df -def marker_plots(df, output_name, wd="."): +def marker_plots(df, output_name, kit, wd="."): Path(f"{wd}/MarkerPlots").mkdir(parents=True, exist_ok=True) filt_df = df[df["allele_type"] == "Typed"] for sample_id in df["SampleID"].unique(): @@ -365,15 +421,15 @@ def marker_plots(df, output_name, wd="."): else: with PdfPages(f"{wd}/MarkerPlots/{output_name}_{sample_id}_marker_plots.pdf") as pdf: if not filt_df[filt_df["SampleID"] == sample_id].empty: - make_plot(filt_df, sample_id, output_name, filters=True, at=False) + make_plot(filt_df, sample_id, output_name, kit, filters=True, at=False) pdf.savefig() - make_plot(df, sample_id, output_name) + make_plot(df, sample_id, output_name, kit) pdf.savefig() - make_plot(df, sample_id, output_name, sameyaxis=True) + make_plot(df, sample_id, output_name, kit, sameyaxis=True) pdf.savefig() -def make_plot(df, sample_id, output_name, sameyaxis=False, filters=False, at=True): +def make_plot(df, sample_id, output_name, kit, sameyaxis=False, filters=False, at=True): sample_df = df[df["SampleID"] == sample_id].copy() conditions = [ sample_df["allele_type"].str.contains("Typed"), @@ -389,7 +445,10 @@ def make_plot(df, sample_id, output_name, sameyaxis=False, filters=False, at=Tru increase_value = int(math.ceil((max_yvalue / 5) / n)) * n fig = plt.figure(figsize=(30, 30)) n = 0 - str_list = ystrs if "sexloci" in output_name else strs + if kit == "powerseq": + str_list = p_ystrs if "sexloci" in output_name else p_strs + else: + str_list = f_ystrs if "sexloci" in output_name else f_strs for marker in str_list: n += 1 colors = {"Typed": "green", "Stutter": "blue", "BelowAT": "red", "Deleted": "purple"} @@ -478,6 +537,7 @@ def process_input( profile_type, data_type, output_type, + kit, strand="forward", nofiltering=False, separate=False, @@ -498,19 +558,21 @@ def process_input( ) if nofiltering: full_df["allele_type"] = "Typed" - marker_plots(full_df, input_name) + marker_plots(full_df, input_name, kit) if output_type == "efm" or output_type == "mpsproto": - EFM_output(full_df, outpath, profile_type, data_type, brack_col, sex, separate) + EFM_output(full_df, outpath, profile_type, data_type, brack_col, sex, kit, separate) else: STRmix_output(full_df, outpath, profile_type, data_type, seq_col) else: dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} id_list = full_df["SampleID"].unique() - final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col) + final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col, kit) if final_df is not None: - marker_plots(final_df, input_name) + marker_plots(final_df, input_name, kit) if output_type == "efm" or output_type == "mpsproto": - EFM_output(final_df, outpath, profile_type, data_type, brack_col, sex, separate) + EFM_output( + final_df, outpath, profile_type, data_type, brack_col, sex, kit, separate + ) else: STRmix_output(final_df, outpath, profile_type, data_type, seq_col, id_list) if info: @@ -534,6 +596,7 @@ def main( strand, custom, sex, + kit, ): input = str(input) if profile_type not in ("evidence", "reference"): @@ -553,6 +616,7 @@ def main( profile_type, data_type, output_type, + kit, strand=strand, nofiltering=nofilters, separate=separate, @@ -568,6 +632,7 @@ def main( profile_type, data_type, output_type, + kit, strand=strand, nofiltering=nofilters, separate=separate, @@ -590,4 +655,5 @@ def main( strand=snakemake.params.strand, custom=snakemake.params.custom, sex=snakemake.params.sex, + kit=snakemake.params.kit, ) From f089083466d49871e67d11b87936cc4b11245f36 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Tue, 27 May 2025 06:03:46 -0400 Subject: [PATCH 14/21] added empty plots to GUI for missing markers [skip ci] --- lusSTR/cli/gui.py | 70 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/lusSTR/cli/gui.py b/lusSTR/cli/gui.py index e5f88f2e..588cfcbb 100644 --- a/lusSTR/cli/gui.py +++ b/lusSTR/cli/gui.py @@ -22,6 +22,7 @@ import pandas as pd from pathlib import Path import plotly.express as px +import plotly.graph_objs as go import streamlit as st from streamlit_option_menu import option_menu import yaml @@ -146,6 +147,64 @@ def main(): # lusSTR Home Page # ##################################################################### +p_strs = [ + "AMELOGENIN", + "CSF1PO", + "D10S1248", + "D12S391", + "D13S317", + "D16S539", + "D18S51", + "D19S433", + "D1S1656", + "D21S11", + "D22S1045", + "D2S1338", + "D2S441", + "D3S1358", + "D5S818", + "D6S1043", + "D7S820", + "D8S1179", + "FGA", + "PENTA D", + "PENTA E", + "TH01", + "TPOX", + "VWA", +] + +f_strs = [ + "AMELOGENIN", + "CSF1PO", + "D10S1248", + "D12S391", + "D13S317", + "D16S539", + "D17S1301", + "D18S51", + "D19S433", + "D1S1656", + "D20S482", + "D21S11", + "D22S1045", + "D2S1338", + "D2S441", + "D3S1358", + "D4S2408", + "D5S818", + "D6S1043", + "D7S820", + "D8S1179", + "D9S1122", + "FGA", + "PENTA D", + "PENTA E", + "TH01", + "TPOX", + "VWA", +] + def show_home_page(): @@ -197,7 +256,9 @@ def interactive_plots_allmarkers(sample_df, flagged_df): max_yvalue = (int(math.ceil(max_reads / n)) * n) + n increase_value = int(math.ceil((max_yvalue / 5) / n)) * n n = 0 - for marker in sample_df["Locus"].unique(): + all_loci = f_strs if st.session_state.kit == "forenseq" else p_strs + missing_loci = [x for x in all_loci if x not in sample_df["Locus"].unique()] + for marker in all_loci: col = cols[n] container = col.container(border=True) sample_locus = sample_df["SampleID"].unique() + "_" + marker @@ -210,7 +271,12 @@ def interactive_plots_allmarkers(sample_df, flagged_df): ) if sample_locus in flagged_df["key"].values: marker = f"⚠️{marker}⚠️" - plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True) + if marker in missing_loci: + marker = f"⚠️{marker}⚠️" + plot = go.Figure() + plot.update_layout(title=marker) + else: + plot = interactive_plots(marker_df, marker, max_yvalue, increase_value, all=True) container.plotly_chart(plot, use_container_width=True) if n == 3: n = 0 From cd44d52ffcb51eb8ded25ed8dcf1d5efbad668d5 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 28 May 2025 06:07:54 -0400 Subject: [PATCH 15/21] removed extra marker in powerseq list [skip ci] --- lusSTR/cli/gui.py | 1 - 1 file changed, 1 deletion(-) diff --git a/lusSTR/cli/gui.py b/lusSTR/cli/gui.py index 588cfcbb..bcd3a005 100644 --- a/lusSTR/cli/gui.py +++ b/lusSTR/cli/gui.py @@ -163,7 +163,6 @@ def main(): "D2S441", "D3S1358", "D5S818", - "D6S1043", "D7S820", "D8S1179", "FGA", From 1fb78462be75572ed0a1e54e14b771b21b5c4fb1 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 28 May 2025 06:08:11 -0400 Subject: [PATCH 16/21] removed extra marker in powerseq list [skip ci] --- lusSTR/wrappers/filter.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 1f818d2b..37bbb7eb 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -43,7 +43,6 @@ "D2S441", "D3S1358", "D5S818", - "D6S1043", "D7S820", "D8S1179", "FGA", @@ -205,7 +204,6 @@ def process_strs(dict_loc, datatype, seq_col, brack_col, kit): def EFM_output(profile, outfile, profile_type, data_type, col, sex, kit, separate=False): - profile = profile[profile["Locus"] != "AMELOGENIN"] if profile_type == "reference": profile = profile.query("allele_type == 'Typed'") else: @@ -266,7 +264,8 @@ def populate_efm_profile(profile, data_type, colname, sex, kit): for col in height_columns: efm_profile[col] = efm_profile[col].astype("Int64") efm_profile = efm_profile.sort_values(by=["SampleName", "Marker"]) - return efm_profile + efm_profile_noamel = efm_profile[efm_profile["Marker"] != "AMELOGENIN"] + return efm_profile_noamel def write_sample_specific_efm_profiles(efm_profile, profile_type, data_type, outdir): @@ -314,7 +313,7 @@ def determine_max_num_alleles(allele_heights): return max_num_alleles -def STRmix_output(profile, outdir, profile_type, data_type, seq_col, id_list): +def STRmix_output(profile, outdir, profile_type, data_type, seq_col): profile = profile[profile["Locus"] != "AMELOGENIN"] Path(outdir).mkdir(parents=True, exist_ok=True) if profile_type == "reference": @@ -339,6 +338,7 @@ def STRmix_output(profile, outdir, profile_type, data_type, seq_col, id_list): {"Locus": {"VWA": "vWA", "PENTA D": "PentaD", "PENTA E": "PentaE"}}, inplace=True ) Path(outdir).mkdir(exist_ok=True) + id_list = strmix_profile["SampleID"].unique() for id in id_list: sample_df = strmix_profile[strmix_profile["SampleID"] == id].reset_index(drop=True) if profile_type == "evidence": @@ -565,7 +565,6 @@ def process_input( STRmix_output(full_df, outpath, profile_type, data_type, seq_col) else: dict_loc = {k: v for k, v in full_df.groupby(["SampleID", "Locus"])} - id_list = full_df["SampleID"].unique() final_df, flags_df = process_strs(dict_loc, data_type, seq_col, brack_col, kit) if final_df is not None: marker_plots(final_df, input_name, kit) @@ -574,7 +573,7 @@ def process_input( final_df, outpath, profile_type, data_type, brack_col, sex, kit, separate ) else: - STRmix_output(final_df, outpath, profile_type, data_type, seq_col, id_list) + STRmix_output(final_df, outpath, profile_type, data_type, seq_col) if info: name = os.path.basename(outpath) final_df.to_csv(f"{outpath}/{input_name}_sequence_info.csv", index=False) From eec3ac129c70d257656136d35494e3d8a0618945 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 28 May 2025 06:11:37 -0400 Subject: [PATCH 17/21] began updating tests [skip ci] --- .../LUSPlus_sequence_info.csv | 42 ++++++------- .../NGS_stutter_test/Sample1_nofilter.csv | 54 ++++++++-------- .../data/STRaitRazor_output_test_A001.csv | 2 + .../tests/data/STRait_Razor_test_output.csv | 34 ++++++++++ lusSTR/tests/data/UAS_bulk_test.csv | 4 ++ lusSTR/tests/data/lusstr_output.csv | 2 + .../Positive_Control_evidence_ngs.csv | 62 +++++++++---------- lusSTR/tests/data/testformat.csv | 2 + 8 files changed, 123 insertions(+), 79 deletions(-) diff --git a/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv b/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv index 5a2541dd..84c079d7 100644 --- a/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv +++ b/lusSTR/tests/data/LUSPlus_stutter_test/LUSPlus_sequence_info.csv @@ -1,24 +1,24 @@ SampleID,Locus,CE_Allele,LUS_Plus,Reads,allele_type,parent_allele1,parent_allele2,allele1_ref_reads,allele2_ref_reads,perc_noise,perc_stutter -Sample1,D4S2408,10.0,10_10_0,1022,Typed,,,,,, -Sample1,D4S2408,9.0,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,, -Sample1,D4S2408,8.0,8_8_0,1050,Typed,,,,,, -Sample1,D8S1179,14.0,14_12_1_0,869,Typed,,,,,, -Sample1,D8S1179,13.0,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212 -Sample1,D8S1179,12.0,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201 -Sample1,D9S1122,13.0,13_11,948,Typed,,,,,, -Sample1,D9S1122,12.0,12_10,108,-1_stutter,13_11,,948.0,,,0.114 -Sample1,D9S1122,11.0,11_11,991,Typed,,,,,, -Sample1,D9S1122,10.0,10_10,87,-1_stutter,11_11,,991.0,,,0.088 -Sample1,FGA,23.0,23_15_3_0,1436,Typed,,,,,, -Sample1,FGA,22.0,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182 -Sample1,FGA,21.0,21_13_3_0,48,BelowAT,,,,,0.013, -Sample1,FGA,20.0,20_12_3_0,1750,Typed,,,,,, -Sample1,FGA,18.0,18_10_3_0,181,Typed,,,,,, -Sample1,FGA,17.0,17_9_3_0,15,BelowAT,,,,,0.004, -Sample1,PENTA D,15.0,15_15,50,Typed,,,,,, -Sample1,PENTA D,13.0,13_13,1000,Typed,,,,,, +Sample1,D4S2408,10,10_10_0,1022,Typed,,,,,, +Sample1,D4S2408,9,9_9_0,116,-1_stutter/+1_stutter,10_10_0,8_8_0,1022.0,1050.0,, +Sample1,D4S2408,8,8_8_0,1050,Typed,,,,,, +Sample1,D8S1179,14,14_12_1_0,869,Typed,,,,,, +Sample1,D8S1179,13,13_11_1_0,184,-1_stutter,14_12_1_0,,869.0,,,0.212 +Sample1,D8S1179,12,12_10_1_0,37,-2_stutter,14_12_1_0,,869.0,,,0.201 +Sample1,D9S1122,13,13_11,948,Typed,,,,,, +Sample1,D9S1122,12,12_10,108,-1_stutter,13_11,,948.0,,,0.114 +Sample1,D9S1122,11,11_11,991,Typed,,,,,, +Sample1,D9S1122,10,10_10,87,-1_stutter,11_11,,991.0,,,0.088 +Sample1,FGA,23,23_15_3_0,1436,Typed,,,,,, +Sample1,FGA,22,22_14_3_0,262,-1_stutter,23_15_3_0,,1436.0,,,0.182 +Sample1,FGA,21,21_13_3_0,48,BelowAT,,,,,0.013, +Sample1,FGA,20,20_12_3_0,1750,Typed,,,,,, +Sample1,FGA,18,18_10_3_0,181,Typed,,,,,, +Sample1,FGA,17,17_9_3_0,15,BelowAT,,,,,0.004, +Sample1,PENTA D,15,15_15,50,Typed,,,,,, +Sample1,PENTA D,13,13_13,1000,Typed,,,,,, Sample1,PENTA E,7.0,7_7,505,Typed,,,,,, -Sample1,TH01,7.0,7_7,2197,Typed,,,,,, -Sample1,TH01,6.0,6_6,1632,Typed,,,,,, -Sample1,TH01,5.0,5_5,66,BelowAT,,,,,0.017, +Sample1,TH01,7,7_7,2197,Typed,,,,,, +Sample1,TH01,6,6_6,1632,Typed,,,,,, +Sample1,TH01,5,5_5,66,BelowAT,,,,,0.017, Sample1,TPOX,11.0,11_11,15,BelowAT,,,,,1.0, diff --git a/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv b/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv index 7531c6f3..b83fce36 100644 --- a/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv +++ b/lusSTR/tests/data/NGS_stutter_test/Sample1_nofilter.csv @@ -1,28 +1,28 @@ Locus,CE Allele,Allele Seq,Reads -D4S2408,8.0,ATCTATCTATCTATCTATCTATCTATCTATCT,1000 -D4S2408,9.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357 -D4S2408,10.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,900 -D8S1179,12.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26 -D8S1179,12.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,11 -D8S1179,13.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95 -D8S1179,13.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89 -D8S1179,14.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739 -D8S1179,14.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,130 -D9S1122,10.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,87 -D9S1122,11.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,991 -D9S1122,12.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,108 -D9S1122,13.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,948 -FGA,17.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,15 -FGA,18.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,181 -FGA,20.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1750 -FGA,21.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,48 -FGA,22.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,262 -FGA,23.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1436 -PentaD,13.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000 -PentaD,15.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,50 -PentaE,7.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505 -TH01,5.0,AATGAATGAATGAATGAATG,66 -TH01,6.0,AATGAATGAATGAATGAATGAATG,1632 -TH01,7.0,AATGAATGAATGAATGAATGAATGAATG,2197 -TPOX,11.0,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,15 -vWA,16.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,6 +D4S2408,8,ATCTATCTATCTATCTATCTATCTATCTATCT,1000 +D4S2408,9,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357 +D4S2408,10,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,900 +D8S1179,12,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTA,26 +D8S1179,12,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,11 +D8S1179,13,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,95 +D8S1179,13,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,89 +D8S1179,14,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,739 +D8S1179,14,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,130 +D9S1122,10,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,87 +D9S1122,11,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,991 +D9S1122,12,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,108 +D9S1122,13,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,948 +FGA,17,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,15 +FGA,18,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,181 +FGA,20,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1750 +FGA,21,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,48 +FGA,22,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,262 +FGA,23,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1436 +PentaD,13,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,1000 +PentaD,15,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,50 +PentaE,7,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505 +TH01,5,AATGAATGAATGAATGAATG,66 +TH01,6,AATGAATGAATGAATGAATGAATG,1632 +TH01,7,AATGAATGAATGAATGAATGAATGAATG,2197 +TPOX,11,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,15 +vWA,16,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,6 diff --git a/lusSTR/tests/data/STRaitRazor_output_test_A001.csv b/lusSTR/tests/data/STRaitRazor_output_test_A001.csv index 219ee03f..c4a2a09a 100644 --- a/lusSTR/tests/data/STRaitRazor_output_test_A001.csv +++ b/lusSTR/tests/data/STRaitRazor_output_test_A001.csv @@ -1,4 +1,6 @@ Locus,Total_Reads,Sequence,SampleID,Project,Analysis +Amelogenin,226,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A001,NA,NA +Amelogenin,162,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A001,NA,NA CSF1PO,547,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,NA,NA CSF1PO,25,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,NA,NA CSF1PO,7,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,NA,NA diff --git a/lusSTR/tests/data/STRait_Razor_test_output.csv b/lusSTR/tests/data/STRait_Razor_test_output.csv index 57449926..959433cf 100644 --- a/lusSTR/tests/data/STRait_Razor_test_output.csv +++ b/lusSTR/tests/data/STRait_Razor_test_output.csv @@ -1,4 +1,6 @@ Locus,Total_Reads,Sequence,SampleID,Project,Analysis +Amelogenin,226,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A001,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,162,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A001,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,547,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,25,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,7,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A001,STRait_Razor_test_output,STRait_Razor_test_output @@ -610,6 +612,8 @@ vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A001,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A001,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A001,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,249,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A002,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,171,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A002,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,498,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A002,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,402,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A002,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,41,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A002,STRait_Razor_test_output,STRait_Razor_test_output @@ -1450,6 +1454,8 @@ vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATTGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGGCAGACAGATAGATCAAT,A002,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGGTGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A002,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATAAAT,A002,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,313,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A003,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,167,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A003,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,696,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A003,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,35,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A003,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,10,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A003,STRait_Razor_test_output,STRait_Razor_test_output @@ -2313,6 +2319,8 @@ vWA,3,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAG vWA,3,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A003,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A003,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A003,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,178,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A004,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,135,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A004,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,469,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A004,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,381,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A004,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,22,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A004,STRait_Razor_test_output,STRait_Razor_test_output @@ -3173,6 +3181,11 @@ vWA,2,AATACATAGGATGGATGGATAGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A004,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGATAGATCAAT,A004,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A004,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,322,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A005,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,299,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A005,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,3,TAGTGGGTGGATTCATCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A005,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTTCAGTTCCTACCAC,A005,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGGATGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A005,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,817,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A005,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,28,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A005,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,13,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A005,STRait_Razor_test_output,STRait_Razor_test_output @@ -4101,6 +4114,9 @@ vWA,2,AATACATAGGATGGATAGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A005,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATGGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A005,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGTTCAAT,A005,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,255,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A006,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,196,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A006,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGCGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A006,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,429,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A006,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,390,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A006,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,59,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A006,STRait_Razor_test_output,STRait_Razor_test_output @@ -4892,6 +4908,8 @@ vWA,2,AATACATAGGATGGATGGATAGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A006,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAAACAGACAGATAGATCAAT,A006,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGACAGATAGATCAAT,A006,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,230,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A007,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,212,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A007,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,864,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A007,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,57,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A007,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,8,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A007,STRait_Razor_test_output,STRait_Razor_test_output @@ -5885,6 +5903,10 @@ vWA,3,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A007,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A007,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A007,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,385,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A008,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,259,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A008,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,3,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACCGTTCCTACCAC,A008,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGGGTGGATACTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A008,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,523,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A008,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,483,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A008,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,30,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A008,STRait_Razor_test_output,STRait_Razor_test_output @@ -6808,6 +6830,8 @@ vWA,2,AATACATAGAATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A008,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A008,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,GATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A008,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,317,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A009,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,194,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A009,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,407,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A009,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,338,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A009,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,16,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A009,STRait_Razor_test_output,STRait_Razor_test_output @@ -7722,6 +7746,9 @@ vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGGCAGATAGATCAAT,A009,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGACAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A009,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A009,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,227,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A010,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,143,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A010,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTATAGTTCCTACCAT,A010,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,664,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A010,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,39,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A010,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,2,CGTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A010,STRait_Razor_test_output,STRait_Razor_test_output @@ -8718,6 +8745,8 @@ vWA,3,GATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAG vWA,3,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATCAAT,A010,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGATAGACAGACAGATAGATCAAT,A010,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGATAGATCAAT,A010,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,276,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A011,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,228,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A011,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,449,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A011,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,272,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A011,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,28,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A011,STRait_Razor_test_output,STRait_Razor_test_output @@ -9677,6 +9706,11 @@ vWA,3,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAG vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGATAGATCAAT,A011,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACGGACAGACAGATAGATCAAT,A011,STRait_Razor_test_output,STRait_Razor_test_output vWA,2,AATACATAGGATGGATGGATAGATGGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGATCAAT,A011,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,418,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A012,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,339,TAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A012,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGTGTTGATTCTCTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCAC,A012,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGGGTGGATTCTTCGTCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A012,STRait_Razor_test_output,STRait_Razor_test_output +Amelogenin,2,TAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTATCAAGTGGTCCCAATTTTACAGTTCCTACCAT,A012,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,1131,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A012,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,43,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A012,STRait_Razor_test_output,STRait_Razor_test_output CSF1PO,13,CTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTT,A012,STRait_Razor_test_output,STRait_Razor_test_output diff --git a/lusSTR/tests/data/UAS_bulk_test.csv b/lusSTR/tests/data/UAS_bulk_test.csv index 88663214..064b8952 100644 --- a/lusSTR/tests/data/UAS_bulk_test.csv +++ b/lusSTR/tests/data/UAS_bulk_test.csv @@ -1,4 +1,6 @@ Locus,Reads,Repeat Sequence,SampleID,Project,Analysis +Amelogenin,143,,Positive Control,Project1,Analysis1 +Amelogenin,283,AAAGTG,Positive Control,Project1,Analysis1 D1S1656,33,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,13,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,231,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 @@ -127,6 +129,8 @@ D22S1045,13,ATTATTATTATTATTATTATTATTATTATTATTACTATTATT,Positive Control,Project1 D22S1045,146,ATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,Positive Control,Project1,Analysis1 D22S1045,1746,ATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,Positive Control,Project1,Analysis1 D22S1045,27,ATTATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,Positive Control,Project1,Analysis1 +Amelogenin,143,,Positive Control2,Project1,Analysis1 +Amelogenin,283,AAAGTG,Positive Control2,Project1,Analysis1 D1S1656,33,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control2,Project1,Analysis1 D1S1656,13,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,Positive Control2,Project1,Analysis1 D1S1656,231,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control2,Project1,Analysis1 diff --git a/lusSTR/tests/data/lusstr_output.csv b/lusSTR/tests/data/lusstr_output.csv index a53ea100..cce2ce0c 100644 --- a/lusSTR/tests/data/lusstr_output.csv +++ b/lusSTR/tests/data/lusstr_output.csv @@ -1,4 +1,6 @@ Locus,Reads,Repeat Sequence,SampleID,Project,Analysis +Amelogenin,143,,Positive Control,Project1,Analysis1 +Amelogenin,283,AAAGTG,Positive Control,Project1,Analysis1 D1S1656,33,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,13,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,231,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 diff --git a/lusSTR/tests/data/lusstr_output/Positive_Control_evidence_ngs.csv b/lusSTR/tests/data/lusstr_output/Positive_Control_evidence_ngs.csv index f5901b88..ab397eff 100644 --- a/lusSTR/tests/data/lusstr_output/Positive_Control_evidence_ngs.csv +++ b/lusSTR/tests/data/lusstr_output/Positive_Control_evidence_ngs.csv @@ -1,6 +1,6 @@ Locus,CE Allele,Allele Seq,Reads -CSF1PO,11.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,29 -CSF1PO,12.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,702 +CSF1PO,11.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,29 +CSF1PO,12.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,702 D10S1248,12.0,GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA,163 D10S1248,13.0,GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA,1050 D10S1248,14.0,GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA,116 @@ -23,12 +23,12 @@ D18S51,15.0,AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAG D18S51,16.0,AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG,1009 D18S51,17.0,AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG,165 D18S51,18.0,AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG,999 -D19S433,12.0,AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG,122 -D19S433,13.0,AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG,1782 -D19S433,14.0,AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG,1621 -D1S1656,11.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,33 -D1S1656,12.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,231 -D1S1656,13.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,162 +D19S433,12.0,CTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTT,122 +D19S433,13.0,CTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTT,1782 +D19S433,14.0,CTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTT,1621 +D1S1656,11.0,CACACACACACCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,33 +D1S1656,12.0,CACACACACACCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,231 +D1S1656,13.0,CACACACACATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,162 D20S482,13.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,337 D20S482,14.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,3136 D20S482,15.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,2731 @@ -37,10 +37,10 @@ D21S11,29.0,TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATC D21S11,31.2,TCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATATCTA,1064 D22S1045,15.0,ATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,146 D22S1045,16.0,ATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,1746 -D2S1338,21.0,TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC,246 -D2S1338,22.0,TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC,2165 -D2S1338,24.0,TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC,259 -D2S1338,25.0,TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC,1656 +D2S1338,21.0,GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA,246 +D2S1338,22.0,GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA,2165 +D2S1338,24.0,GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA,259 +D2S1338,25.0,GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA,1656 D2S441,10.0,TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,1781 D2S441,14.0,TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTTATCTATCTA,1330 D3S1358,16.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,135 @@ -48,15 +48,15 @@ D3S1358,17.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT D3S1358,18.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,1855 D4S2408,8.0,ATCTATCTATCTATCTATCTATCTATCTATCT,38 D4S2408,9.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357 -D5S818,11.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAG,21 -D5S818,12.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAG,382 -D6S1043,11.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,219 -D6S1043,12.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,2088 -D6S1043,19.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATACATAGATAGATAGATAGATAGAT,138 -D6S1043,20.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATACATAGATAGATAGATAGATAGAT,1487 -D7S820,8.0,GATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT,517 -D7S820,10.0,GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT,33 -D7S820,11.0,GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT,522 +D5S818,11.0,CTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,21 +D5S818,12.0,CTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,382 +D6S1043,11.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,219 +D6S1043,12.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,2088 +D6S1043,19.0,ATCTATCTATCTATCTATCTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,138 +D6S1043,20.0,ATCTATCTATCTATCTATCTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,1487 +D7S820,8.0,AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATC,517 +D7S820,10.0,AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC,33 +D7S820,11.0,AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC,522 D8S1179,13.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,135 D8S1179,14.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,971 D8S1179,15.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,661 @@ -64,21 +64,21 @@ D9S1122,11.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,131 D9S1122,11.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,115 D9S1122,12.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,1551 D9S1122,12.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,1427 -FGA,19.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,135 -FGA,20.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1043 -FGA,22.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,182 -FGA,23.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1023 +FGA,19.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,135 +FGA,20.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,1043 +FGA,22.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,182 +FGA,23.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,1023 PentaD,12.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,289 PentaD,13.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,278 -PentaE,7.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505 -PentaE,14.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,261 +PentaE,7.0,TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT,505 +PentaE,14.0,TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT,261 TH01,5.0,AATGAATGAATGAATGAATG,247 TH01,6.0,AATGAATGAATGAATGAATGAATG,4542 TH01,8.3,AATGAATGAATGAATGAATGATGAATGAATGAATG,151 TH01,9.3,AATGAATGAATGAATGAATGAATGATGAATGAATGAATG,3581 TPOX,10.0,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,59 TPOX,11.0,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,1216 -vWA,15.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,23 -vWA,16.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,377 -vWA,18.0,TCTATCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,43 -vWA,19.0,TCTATCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,325 +vWA,15.0,TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGA,23 +vWA,16.0,TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGA,377 +vWA,18.0,TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGA,43 +vWA,19.0,TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGA,325 diff --git a/lusSTR/tests/data/testformat.csv b/lusSTR/tests/data/testformat.csv index a53ea100..cce2ce0c 100644 --- a/lusSTR/tests/data/testformat.csv +++ b/lusSTR/tests/data/testformat.csv @@ -1,4 +1,6 @@ Locus,Reads,Repeat Sequence,SampleID,Project,Analysis +Amelogenin,143,,Positive Control,Project1,Analysis1 +Amelogenin,283,AAAGTG,Positive Control,Project1,Analysis1 D1S1656,33,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,13,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,Positive Control,Project1,Analysis1 D1S1656,231,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,Positive Control,Project1,Analysis1 From 740c5eaa388a4747fac68db3fe2551d86bcc5f2a Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Mon, 2 Jun 2025 05:57:38 -0400 Subject: [PATCH 18/21] updated remaining tests --- .../tests/data/genemarker/genemarker_test.csv | 2 + .../tests/data/genemarker/genemarker_test.txt | 2 + .../genemarker/genemarker_test_flanks.txt | 2 + lusSTR/tests/data/lusstr_output.txt | 2 + .../Positive_Control_evidence_ngs.csv | 62 +++++++++---------- lusSTR/wrappers/convert.py | 4 +- lusSTR/wrappers/filter.py | 4 -- 7 files changed, 42 insertions(+), 36 deletions(-) diff --git a/lusSTR/tests/data/genemarker/genemarker_test.csv b/lusSTR/tests/data/genemarker/genemarker_test.csv index 142ac4b4..944ea126 100644 --- a/lusSTR/tests/data/genemarker/genemarker_test.csv +++ b/lusSTR/tests/data/genemarker/genemarker_test.csv @@ -1,4 +1,6 @@ Locus,Total_Reads,Sequence,SampleID,Project,Analysis +Amelogenin,14189,TCAGCTATGAGGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCACCAGCTTCCCA,2800M_strresults_filtered,NA,NA +Amelogenin,11986,TCAGCTATGAGGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCATCAGCTTCCCA,2800M_strresults_filtered,NA,NA PentaE,6733,TAATGATTACATAACATACATGTGTGTAAAGTGCTTAGTATCATGATTGATACATGGAAAGAATTCTCTTATTTGGGTTATTAATTGAGAAAACTCCTTACAATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTGAGAC,2800M_strresults_filtered,NA,NA PentaE,4746,TAATGATTACATAACATACATGTGTGTAAAGTGCTTAGTATCATGATTGATACATGGAAAGAATTCTCTTATTTGGGTTATTAATTGAGAAAACTCCTTACAATTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTGAGAC,2800M_strresults_filtered,NA,NA D18S51,602,AGGCTGCAGTGAGCCATGTTCATGCCACTGCACTTCACTCTGAGTGACAAATTGAGACCTTGTCTCAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAGGAAAGAAAGAGAAAAAGAAAAGAAATAGTAGCAACTGTTATTGTA,2800M_strresults_filtered,NA,NA diff --git a/lusSTR/tests/data/genemarker/genemarker_test.txt b/lusSTR/tests/data/genemarker/genemarker_test.txt index 3b3fc202..326eb682 100644 --- a/lusSTR/tests/data/genemarker/genemarker_test.txt +++ b/lusSTR/tests/data/genemarker/genemarker_test.txt @@ -69,3 +69,5 @@ SampleID Project Analysis Locus UAS_Output_Sequence Forward_Strand_Sequence UAS_ 2800M_strresults_filtered NA NA D10S1248 GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA [GGAA]12 [GGAA]12 12 12_12 12_12 741 2800M_strresults_filtered NA NA CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [AGAT]12 [ATCT]12 12 12_12 12_12_0 14044 2800M_strresults_filtered NA NA CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [AGAT]11 [ATCT]11 11 11_11 11_11_0 1047 +2800M_strresults_filtered NA NA AMELOGENIN GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG NA NA 14189 +2800M_strresults_filtered NA NA AMELOGENIN GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG NA NA 11986 diff --git a/lusSTR/tests/data/genemarker/genemarker_test_flanks.txt b/lusSTR/tests/data/genemarker/genemarker_test_flanks.txt index 9740c817..a445bbc4 100644 --- a/lusSTR/tests/data/genemarker/genemarker_test_flanks.txt +++ b/lusSTR/tests/data/genemarker/genemarker_test_flanks.txt @@ -71,3 +71,5 @@ SampleID Project Analysis Locus Reads CE_Allele Full_Sequence 5_Flank_Bracketed_ 2800M_strresults_filtered NA NA D10S1248 741 12 CCCCAGGACCAATCTGGTCACAAACATATTAATGAATTGAACAAATGAGTGAGTGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA CCCC AGGA CCAA TCTG GTCA CAAA CATA TTAA TGAA TT GAAC AAAT [GAGT]2 [GGAA]12 2800M_strresults_filtered NA NA CSF1PO 14044 12 CTAAGTACTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTTCTATCTATGAAGGCAGTTACTGTTAATATCTTCATTTTACAGGTAGGAAAACTGAGACACAGGGTGGTTAGCAACCTGCTAGTCCTTGGCAGACTCAG CTA AGTA CT TCCT [ATCT]12 A [ATCT]3 T [CTAT]2 GAAG GCAG TTAC TGTT AATA TCTT CATT TTAC AGGT AGGA AAAC TGAG ACAC AGGG TGGT TAG CA ACCT GCTA GTCC TTGG CAGA CTCA G 2800M_strresults_filtered NA NA CSF1PO 1047 11 CTAAGTACTTCCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTAATCTATCTATCTTCTATCTATGAAGGCAGTTACTGTTAATATCTTCATTTTACAGGTAGGAAAACTGAGACACAGGGTGGTTAGCAACCTGCTAGTCCTTGGCAGACTCAG CTA AGTA CT TCCT [ATCT]11 A [ATCT]3 T [CTAT]2 GAAG GCAG TTAC TGTT AATA TCTT CATT TTAC AGGT AGGA AAAC TGAG ACAC AGGG TGGT TAG CA ACCT GCTA GTCC TTGG CAGA CTCA G +2800M_strresults_filtered NA NA AMELOGENIN 14189 GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG TCAGCTATGAGGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAGTGGTCCTGATTTTACAGTTCCTACCACCAGCTTCCCA T C A G C T A T G A GGTAATTTTTCTCTTTACTAATTTTGACCATTGTTTGCGTTAACAATGCCCTGGGCTCTGTAAAGAATAGTGTGTTGATTCTTTATCCCAGATGTTTCTCAAG T [G]2 T [C]2 T G A [T]4 A C A G [T]2 [C]2 T A [C]2 A [C]2 A G C [T]2 [C]3 A Possible indel or partial sequence +2800M_strresults_filtered NA NA AMELOGENIN 11986 GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG TCAGCTATGAGGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAGTGGTCCCAATTTTACAGTTCCTACCATCAGCTTCCCA T C A G C T A T G A GGTAATTTTTCTCTTTACTAATTTTGATCACTGTTTGCATTAGCAGTCCCCTGGGCTCTGTAAAGAATAGTGGGTGGATTCTTCATCCCAAATAAAGTGGTTTCTCAAG T [G]2 T [C]3 [A]2 [T]4 A C A G [T]2 [C]2 T A [C]2 A T C A G C [T]2 [C]3 A Possible indel or partial sequence diff --git a/lusSTR/tests/data/lusstr_output.txt b/lusSTR/tests/data/lusstr_output.txt index d86c37dd..852445e8 100644 --- a/lusSTR/tests/data/lusstr_output.txt +++ b/lusSTR/tests/data/lusstr_output.txt @@ -127,3 +127,5 @@ Positive_Control Project1 Analysis1 D10S1248 GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGA Positive_Control Project1 Analysis1 CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [AGAT]12 [ATCT]12 12 12_12 12_12_0 702 Positive_Control Project1 Analysis1 CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [AGAT]11 [ATCT]11 11 11_11 11_11_0 29 Positive_Control Project1 Analysis1 CSF1PO AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT [AGAT]13 [ATCT]13 13 13_13 13_13_0 11 +Positive_Control Project1 Analysis1 AMELOGENIN AAAGTG AAAGTG AAAGTG AAAGTG Y NA NA 283 +Positive_Control Project1 Analysis1 AMELOGENIN NA X NA NA 143 diff --git a/lusSTR/tests/data/lusstr_output/Positive_Control_evidence_ngs.csv b/lusSTR/tests/data/lusstr_output/Positive_Control_evidence_ngs.csv index ab397eff..f5901b88 100644 --- a/lusSTR/tests/data/lusstr_output/Positive_Control_evidence_ngs.csv +++ b/lusSTR/tests/data/lusstr_output/Positive_Control_evidence_ngs.csv @@ -1,6 +1,6 @@ Locus,CE Allele,Allele Seq,Reads -CSF1PO,11.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,29 -CSF1PO,12.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,702 +CSF1PO,11.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,29 +CSF1PO,12.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,702 D10S1248,12.0,GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA,163 D10S1248,13.0,GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA,1050 D10S1248,14.0,GGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAA,116 @@ -23,12 +23,12 @@ D18S51,15.0,AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAG D18S51,16.0,AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG,1009 D18S51,17.0,AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG,165 D18S51,18.0,AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAAAGAGAGAG,999 -D19S433,12.0,CTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTT,122 -D19S433,13.0,CTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTT,1782 -D19S433,14.0,CTCTCTTTCTTCCTCTCTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTACCTTCTTTCCTT,1621 -D1S1656,11.0,CACACACACACCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,33 -D1S1656,12.0,CACACACACACCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,231 -D1S1656,13.0,CACACACACATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,162 +D19S433,12.0,AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG,122 +D19S433,13.0,AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG,1782 +D19S433,14.0,AAGGAAAGAAGGTAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAGAGAGGAAGAAAGAGAG,1621 +D1S1656,11.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,33 +D1S1656,12.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGGTGTGTGTGTG,231 +D1S1656,13.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATGTGTGTGTG,162 D20S482,13.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,337 D20S482,14.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,3136 D20S482,15.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,2731 @@ -37,10 +37,10 @@ D21S11,29.0,TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATC D21S11,31.2,TCTATCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATATCTA,1064 D22S1045,15.0,ATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,146 D22S1045,16.0,ATTATTATTATTATTATTATTATTATTATTATTATTATTACTATTATT,1746 -D2S1338,21.0,GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA,246 -D2S1338,22.0,GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA,2165 -D2S1338,24.0,GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA,259 -D2S1338,25.0,GGAAGGAAGGACGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGCAGGCAGGCAGGCAGGCAGGCAGGCA,1656 +D2S1338,21.0,TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC,246 +D2S1338,22.0,TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC,2165 +D2S1338,24.0,TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC,259 +D2S1338,25.0,TGCCTGCCTGCCTGCCTGCCTGCCTGCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCTTCCGTCCTTCCTTCC,1656 D2S441,10.0,TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,1781 D2S441,14.0,TCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATTTATCTATCTA,1330 D3S1358,16.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,135 @@ -48,15 +48,15 @@ D3S1358,17.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT D3S1358,18.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,1855 D4S2408,8.0,ATCTATCTATCTATCTATCTATCTATCTATCT,38 D4S2408,9.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCT,1357 -D5S818,11.0,CTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,21 -D5S818,12.0,CTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,382 -D6S1043,11.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,219 -D6S1043,12.0,ATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,2088 -D6S1043,19.0,ATCTATCTATCTATCTATCTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,138 -D6S1043,20.0,ATCTATCTATCTATCTATCTATGTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCT,1487 -D7S820,8.0,AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATC,517 -D7S820,10.0,AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC,33 -D7S820,11.0,AAACTATCAATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC,522 +D5S818,11.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAG,21 +D5S818,12.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAG,382 +D6S1043,11.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,219 +D6S1043,12.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT,2088 +D6S1043,19.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATACATAGATAGATAGATAGATAGAT,138 +D6S1043,20.0,AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATACATAGATAGATAGATAGATAGAT,1487 +D7S820,8.0,GATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT,517 +D7S820,10.0,GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT,33 +D7S820,11.0,GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGATTGATAGTTT,522 D8S1179,13.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,135 D8S1179,14.0,TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,971 D8S1179,15.0,TCTATCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA,661 @@ -64,21 +64,21 @@ D9S1122,11.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,131 D9S1122,11.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,115 D9S1122,12.0,TAGATCGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,1551 D9S1122,12.0,TAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGA,1427 -FGA,19.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,135 -FGA,20.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,1043 -FGA,22.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,182 -FGA,23.0,GGAAGGAAGGAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAGAAAAAAGAAAGAAAGAAA,1023 +FGA,19.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,135 +FGA,20.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1043 +FGA,22.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,182 +FGA,23.0,TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC,1023 PentaD,12.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,289 PentaD,13.0,AAAAGAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,278 -PentaE,7.0,TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT,505 -PentaE,14.0,TCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTTTCTTT,261 +PentaE,7.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,505 +PentaE,14.0,AAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGAAAAGA,261 TH01,5.0,AATGAATGAATGAATGAATG,247 TH01,6.0,AATGAATGAATGAATGAATGAATG,4542 TH01,8.3,AATGAATGAATGAATGAATGATGAATGAATGAATG,151 TH01,9.3,AATGAATGAATGAATGAATGAATGATGAATGAATGAATG,3581 TPOX,10.0,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,59 TPOX,11.0,AATGAATGAATGAATGAATGAATGAATGAATGAATGAATGAATG,1216 -vWA,15.0,TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGA,23 -vWA,16.0,TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGATAGA,377 -vWA,18.0,TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGA,43 -vWA,19.0,TAGATGGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGACAGACAGACAGACAGATAGA,325 +vWA,15.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,23 +vWA,16.0,TCTATCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,377 +vWA,18.0,TCTATCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,43 +vWA,19.0,TCTATCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA,325 diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index 8115c09f..47599c80 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -74,7 +74,9 @@ def format_table(input, software, kit="forenseq", custom=False): len(sequence) <= (remove_5p + remove_3p + len(metadata["LUS"])) and software != "uas" and locus != "AMELOGENIN" - ) or (locus == "AMELOGENIN" and len(sequence) < (remove_5p + remove_3p)): + ) or ( + software != "uas" and locus == "AMELOGENIN" and len(sequence) < (remove_5p + remove_3p) + ): flank_summary = [ sampleid, project, diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 37bbb7eb..3869c5e6 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -119,15 +119,11 @@ "DYS438", "DYS439", "DYS448", - "DYS460", "DYS481", - "DYS505", - "DYS522", "DYS533", "DYS549", "DYS570", "DYS576", - "DYS612", "DYS635", "DYS643", "Y-GATA-H4", From 1f03738016bccb478900f0dca201f9a7af296a62 Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 4 Jun 2025 06:10:46 -0400 Subject: [PATCH 19/21] fixed formatting issues; added str lists as json file --- lusSTR/cli/gui.py | 87 ++++++----------------- lusSTR/data/str_lists.json | 101 ++++++++++++++++++++++++++ lusSTR/wrappers/filter.py | 141 +++++++------------------------------ 3 files changed, 148 insertions(+), 181 deletions(-) create mode 100644 lusSTR/data/str_lists.json diff --git a/lusSTR/cli/gui.py b/lusSTR/cli/gui.py index bcd3a005..29b95ffb 100644 --- a/lusSTR/cli/gui.py +++ b/lusSTR/cli/gui.py @@ -44,6 +44,14 @@ def get_filter_metadata_file(): filter_marker_data = json.load(fh) +def get_strlist_file(): + return importlib.resources.files("lusSTR") / "data/str_lists.json" + + +with open(get_strlist_file(), "r") as fh: + str_lists = json.load(fh) + + # ------------ Function to Generate config.yaml File ---------- # @@ -147,63 +155,6 @@ def main(): # lusSTR Home Page # ##################################################################### -p_strs = [ - "AMELOGENIN", - "CSF1PO", - "D10S1248", - "D12S391", - "D13S317", - "D16S539", - "D18S51", - "D19S433", - "D1S1656", - "D21S11", - "D22S1045", - "D2S1338", - "D2S441", - "D3S1358", - "D5S818", - "D7S820", - "D8S1179", - "FGA", - "PENTA D", - "PENTA E", - "TH01", - "TPOX", - "VWA", -] - -f_strs = [ - "AMELOGENIN", - "CSF1PO", - "D10S1248", - "D12S391", - "D13S317", - "D16S539", - "D17S1301", - "D18S51", - "D19S433", - "D1S1656", - "D20S482", - "D21S11", - "D22S1045", - "D2S1338", - "D2S441", - "D3S1358", - "D4S2408", - "D5S818", - "D6S1043", - "D7S820", - "D8S1179", - "D9S1122", - "FGA", - "PENTA D", - "PENTA E", - "TH01", - "TPOX", - "VWA", -] - def show_home_page(): @@ -255,15 +206,21 @@ def interactive_plots_allmarkers(sample_df, flagged_df): max_yvalue = (int(math.ceil(max_reads / n)) * n) + n increase_value = int(math.ceil((max_yvalue / 5) / n)) * n n = 0 - all_loci = f_strs if st.session_state.kit == "forenseq" else p_strs + all_loci = ( + str_lists["forenseq_strs"] + if st.session_state.kit == "forenseq" + else str_lists["powerseq_strs"] + ) missing_loci = [x for x in all_loci if x not in sample_df["Locus"].unique()] for marker in all_loci: col = cols[n] container = col.container(border=True) sample_locus = sample_df["SampleID"].unique() + "_" + marker - for i, row in sample_df.iterrows(): - if sample_df.loc[i, "Locus"] == "AMELOGENIN": - sample_df.loc[i, "CE_Allele"] = 0 if sample_df.loc[i, "CE_Allele"] == "X" else 1 + sample_df = np.where( + sample_df["Locus"] == "AMELOGENIN", + np.where(sample_df["CE_Allele"] == "X", 0, 1), + sample_df["CE_Allele"], + ) sample_df["CE_Allele"] = pd.to_numeric(sample_df["CE_Allele"]) marker_df = sample_df[sample_df["Locus"] == marker].sort_values( by=["CE_Allele", "allele_type"], ascending=[False, True] @@ -384,9 +341,11 @@ def interactive_setup(df1, file): interactive_plots_allmarkers(sample_df, flags) else: plot_df = sample_df - for i, row in plot_df.iterrows(): - if plot_df.loc[i, "Locus"] == "AMELOGENIN": - plot_df.loc[i, "CE_Allele"] = 0 if plot_df.loc[i, "CE_Allele"] == "X" else 1 + sample_df = np.where( + sample_df["Locus"] == "AMELOGENIN", + np.where(sample_df["CE_Allele"] == "X", 0, 1), + sample_df["CE_Allele"], + ) plot_df["CE_Allele"] = pd.to_numeric(plot_df["CE_Allele"]) locus_key = f"{sample}_{locus}" if locus_key not in st.session_state: diff --git a/lusSTR/data/str_lists.json b/lusSTR/data/str_lists.json new file mode 100644 index 00000000..e1d54894 --- /dev/null +++ b/lusSTR/data/str_lists.json @@ -0,0 +1,101 @@ +{ + + "powerseq_strs" : [ + "AMELOGENIN", + "CSF1PO", + "D10S1248", + "D12S391", + "D13S317", + "D16S539", + "D18S51", + "D19S433", + "D1S1656", + "D21S11", + "D22S1045", + "D2S1338", + "D2S441", + "D3S1358", + "D5S818", + "D7S820", + "D8S1179", + "FGA", + "PENTA D", + "PENTA E", + "TH01", + "TPOX", + "VWA" + ], + "forenseq_strs" : [ + "AMELOGENIN", + "CSF1PO", + "D10S1248", + "D12S391", + "D13S317", + "D16S539", + "D17S1301", + "D18S51", + "D19S433", + "D1S1656", + "D20S482", + "D21S11", + "D22S1045", + "D2S1338", + "D2S441", + "D3S1358", + "D4S2408", + "D5S818", + "D6S1043", + "D7S820", + "D8S1179", + "D9S1122", + "FGA", + "PENTA D", + "PENTA E", + "TH01", + "TPOX", + "VWA" + ], + "powerseq_ystrs" : [ + "DYS19", + "DYS385A-B", + "DYS389II", + "DYS390", + "DYS391", + "DYS392", + "DYS393", + "DYS437", + "DYS438", + "DYS439", + "DYS448", + "DYS456", + "DYS458", + "DYS481", + "DYS533", + "DYS549", + "DYS570", + "DYS576", + "DYS635", + "DYS643", + "Y-GATA-H4" + ], + "forenseq_ystrs" : [ + "DYS19", + "DYS385A-B", + "DYS389II", + "DYS390", + "DYS391", + "DYS392", + "DYS437", + "DYS438", + "DYS439", + "DYS448", + "DYS481", + "DYS533", + "DYS549", + "DYS570", + "DYS576", + "DYS635", + "DYS643", + "Y-GATA-H4" + ] +} \ No newline at end of file diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 3869c5e6..3b3c7981 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -27,109 +27,6 @@ import sys -p_strs = [ - "AMELOGENIN", - "CSF1PO", - "D10S1248", - "D12S391", - "D13S317", - "D16S539", - "D18S51", - "D19S433", - "D1S1656", - "D21S11", - "D22S1045", - "D2S1338", - "D2S441", - "D3S1358", - "D5S818", - "D7S820", - "D8S1179", - "FGA", - "PENTA D", - "PENTA E", - "TH01", - "TPOX", - "VWA", -] - -f_strs = [ - "AMELOGENIN", - "CSF1PO", - "D10S1248", - "D12S391", - "D13S317", - "D16S539", - "D17S1301", - "D18S51", - "D19S433", - "D1S1656", - "D20S482", - "D21S11", - "D22S1045", - "D2S1338", - "D2S441", - "D3S1358", - "D4S2408", - "D5S818", - "D6S1043", - "D7S820", - "D8S1179", - "D9S1122", - "FGA", - "PENTA D", - "PENTA E", - "TH01", - "TPOX", - "VWA", -] - -p_ystrs = [ - "DYS19", - "DYS385A-B", - "DYS389II", - "DYS390", - "DYS391", - "DYS392", - "DYS393", - "DYS437", - "DYS438", - "DYS439", - "DYS448", - "DYS456", - "DYS458", - "DYS481", - "DYS533", - "DYS549", - "DYS570", - "DYS576", - "DYS635", - "DYS643", - "Y-GATA-H4", -] - -f_ystrs = [ - "DYS19", - "DYS385A-B", - "DYS389II", - "DYS390", - "DYS391", - "DYS392", - "DYS437", - "DYS438", - "DYS439", - "DYS448", - "DYS481", - "DYS533", - "DYS549", - "DYS570", - "DYS576", - "DYS635", - "DYS643", - "Y-GATA-H4", -] - - def get_filter_metadata_file(): return importlib.resources.files("lusSTR") / "data/filters.json" @@ -138,11 +35,19 @@ def get_filter_metadata_file(): filter_marker_data = json.load(fh) +def get_strlist_file(): + return importlib.resources.files("lusSTR") / "data/str_lists.json" + + +with open(get_strlist_file(), "r") as fh: + str_lists = json.load(fh) + + def process_strs(dict_loc, datatype, seq_col, brack_col, kit): final_df = pd.DataFrame() flags_df = pd.DataFrame() - strs = p_strs if kit == "powerseq" else f_strs - ystrs = p_ystrs if kit == "powerseq" else f_ystrs + strs = str_lists["powerseq_strs"] if kit == "powerseq" else str_lists["forenseq_strs"] + ystrs = str_lists["powerseq_ystrs"] if kit == "powerseq" else str_lists["forenseq_ystrs"] for key, value in dict_loc.items(): data = dict_loc[key].reset_index(drop=True) if datatype == "ce": @@ -234,8 +139,8 @@ def populate_efm_profile(profile, data_type, colname, sex, kit): allele_heights[row.SampleID][row.Locus][row.Allele] = int(row.Reads) max_num_alleles = determine_max_num_alleles(allele_heights) reformatted_profile = list() - strs = p_strs if kit == "powerseq" else f_strs - ystrs = p_ystrs if kit == "powerseq" else f_ystrs + strs = str_lists["powerseq_strs"] if kit == "powerseq" else str_lists["forenseq_strs"] + ystrs = str_lists["powerseq_ystrs"] if kit == "powerseq" else str_lists["forenseq_ystrs"] for sampleid, loci in allele_heights.items(): for locusid, alleles in loci.items(): allele_list, height_list = list(), list() @@ -442,9 +347,13 @@ def make_plot(df, sample_id, output_name, kit, sameyaxis=False, filters=False, a fig = plt.figure(figsize=(30, 30)) n = 0 if kit == "powerseq": - str_list = p_ystrs if "sexloci" in output_name else p_strs + str_list = ( + str_lists["powerseq_ystrs"] if "sexloci" in output_name else str_lists["powerseq_strs"] + ) else: - str_list = f_ystrs if "sexloci" in output_name else f_strs + str_list = ( + str_lists["forenseq_ystrs"] if "sexloci" in output_name else str_lists["forenseq_strs"] + ) for marker in str_list: n += 1 colors = {"Typed": "green", "Stutter": "blue", "BelowAT": "red", "Deleted": "purple"} @@ -457,7 +366,6 @@ def make_plot(df, sample_id, output_name, kit, sameyaxis=False, filters=False, a 0 if marker_df.loc[i, "CE_Allele"] == "X" else 1 ) marker_df["CE_Allele"] = marker_df["CE_Allele"].astype(float) - # ax = fig.add_subplot(6, 5, n) p = ax.bar( marker_df["CE_Allele"], marker_df["Reads"], @@ -478,16 +386,15 @@ def make_plot(df, sample_id, output_name, kit, sameyaxis=False, filters=False, a if not filters: plt.legend(handles, labels, title="Allele Type") else: + marker_df["Label"] = None for i, row in marker_df.iterrows(): if marker == "AMELOGENIN": - marker_df.loc[i, "Label"] = ( - "X" if marker_df.loc[i, "CE_Allele"] == 0 else "Y" - ) + row["Label"] = "X" if row["CE_Allele"] == 0 else "Y" else: - marker_df.loc[i, "Label"] = ( - str(int(marker_df.loc[i, "CE_Allele"])) - if ".0" in str(marker_df.loc[i, "CE_Allele"]) - else str(marker_df.loc[i, "CE_Allele"]) + row["Label"] = ( + str(int(row["CE_Allele"])) + if ".0" in str(row["CE_Allele"]) + else str(row["CE_Allele"]) ) ax.bar_label(p, labels=marker_df["Label"]) if sameyaxis: From ac814c78b9874bd71923ede6a47a8b01293ca25d Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Wed, 4 Jun 2025 07:16:39 -0400 Subject: [PATCH 20/21] fixed bug --- lusSTR/wrappers/filter.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lusSTR/wrappers/filter.py b/lusSTR/wrappers/filter.py index 3b3c7981..812713e2 100644 --- a/lusSTR/wrappers/filter.py +++ b/lusSTR/wrappers/filter.py @@ -386,12 +386,11 @@ def make_plot(df, sample_id, output_name, kit, sameyaxis=False, filters=False, a if not filters: plt.legend(handles, labels, title="Allele Type") else: - marker_df["Label"] = None for i, row in marker_df.iterrows(): if marker == "AMELOGENIN": - row["Label"] = "X" if row["CE_Allele"] == 0 else "Y" + marker_df.loc[i, "Label"] = "X" if row["CE_Allele"] == 0 else "Y" else: - row["Label"] = ( + marker_df.loc[i, "Label"] = ( str(int(row["CE_Allele"])) if ".0" in str(row["CE_Allele"]) else str(row["CE_Allele"]) From ae3c813da8cd606ec82b1c7830553983de57b5bd Mon Sep 17 00:00:00 2001 From: rnmitchell Date: Thu, 5 Jun 2025 12:07:17 -0400 Subject: [PATCH 21/21] simplified convert code --- lusSTR/wrappers/convert.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/lusSTR/wrappers/convert.py b/lusSTR/wrappers/convert.py index 47599c80..49a7062d 100644 --- a/lusSTR/wrappers/convert.py +++ b/lusSTR/wrappers/convert.py @@ -70,13 +70,10 @@ def format_table(input, software, kit="forenseq", custom=False): remove_5p = remove_5p - metadata["Custom_5"] if metadata["Custom_3"] < 0: remove_3p = remove_3p - metadata["Custom_3"] - if ( - len(sequence) <= (remove_5p + remove_3p + len(metadata["LUS"])) - and software != "uas" - and locus != "AMELOGENIN" - ) or ( - software != "uas" and locus == "AMELOGENIN" and len(sequence) < (remove_5p + remove_3p) - ): + locus_min_length = remove_5p + remove_3p + len(metadata["LUS"]) + if locus == "AMELOGENIN": + locus_min_length -= 1 + if software != "uas" and len(sequence) < locus_min_length: flank_summary = [ sampleid, project,